In [212]:
# Load libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV
import re
import ipaddress
from IPython.display import display, HTML
from cyberpandas import IPArray, to_ipaddress
from datetime import datetime
import os
import time
import json
import csv
import plotly.express as px
import plotly.io as pio
In [75]:
%%HTML
<!-- Set dataframe style -->
<style>.dataframe th{
background:#3f577c; 
font-family:monospace; 
color:white; 
border:3px solid white; 
text-align:left !important;}
</style>
In [76]:
# Load raw data
t1 = time.perf_counter()
dir = 'C:/Users/micha/Downloads/flows'
ct = 0
for fn in os.listdir(dir):

    # Build path and read data frame
    ct += 1
    fullpath = os.path.join(dir, fn)
    print("Reading", fn)
    df = pd.read_csv(fullpath, encoding='latin1', low_memory=False, converters={0: str, 1: str, 3: str, 6: str, 84: str})
    #df = pd.read_csv(fullpath, encoding='latin1', low_memory=False)

    # Shape
    print("Shape before data wrangling:", df.shape)
    
    # Most columns have a leading space in their name; strip all column names
    for i in range(0, df.shape[1]):
        df.rename({df.columns[i]: df.columns[i].strip()}, axis = 1, inplace = True)

    # Rename first columns by hand
    df.rename(columns = {'Source IP': 'srcip', 'Destination IP': 'dstip', 'Protocol': 'proto'}, inplace = True)

    # Rename other columns to remove space and make lowercase
    dRename = {
        'source': 'src',
        'destination': 'dst',
        'protocol': 'proto',
        'bytes/s': 'bps',
        'packets/s': 'pps',
        'packet': 'pkt',
        'packets': 'pkt',
        'length': 'len',
        'header': 'hdr',
        'total': 'tot',
        'count': 'ct',
        'average': 'avg',
        'variance': 'var',
        'size': 'sz',
        'forward': 'fwd',
        'backward': 'bwd',
        'segment': 'seg',
        ' ': '_',
        '/': '_',
        '.': '_'
    }
    for i in range(0, df.shape[1]):
        df.rename({df.columns[i]: df.columns[i].lower()}, axis = 1, inplace = True)
        for e in dRename.keys():
            df.rename({df.columns[i]: df.columns[i].replace(e, dRename[e])}, axis = 1, inplace = True)
            
    # Add filename
    df['fn'] = fn

    # Append data to df2
    if ct == 1:
        df2 = df.copy()
    else:
        df2 = pd.concat([df2, df])
    print("Shape after concatenating:", df2.shape)

    # Remove the dataframe from memory
    del df
    print()    

print("Done reading data")
Reading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Shape before data wrangling: (225745, 85)
Shape after concatenating: (225745, 86)

Reading Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Shape before data wrangling: (286467, 85)
Shape after concatenating: (512212, 86)

Reading Friday-WorkingHours-Morning.pcap_ISCX.csv
Shape before data wrangling: (191033, 85)
Shape after concatenating: (703245, 86)

Reading Monday-WorkingHours.pcap_ISCX.csv
Shape before data wrangling: (529918, 85)
Shape after concatenating: (1233163, 86)

Reading Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Shape before data wrangling: (288602, 85)
Shape after concatenating: (1521765, 86)

Reading Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Shape before data wrangling: (170366, 85)
Shape after concatenating: (1692131, 86)

Reading Tuesday-WorkingHours.pcap_ISCX.csv
Shape before data wrangling: (445909, 85)
Shape after concatenating: (2138040, 86)

Reading Wednesday-workingHours.pcap_ISCX.csv
Shape before data wrangling: (692703, 85)
Shape after concatenating: (2830743, 86)

Done reading data
In [263]:
# Additional data wrangling on combined data frame

# Convert proto column to categorical
df2.loc[df2['proto'] == 1, 'proto'] = 'icmp'
df2.loc[df2['proto'] == 6, 'proto'] = 'tcp'
df2.loc[df2['proto'] == 17, 'proto'] = 'udp'

# Convert timestamp field from this format: "7/7/2017 8:59"
# to ISO 8601: "2017-07-07 08:59:00"
# "%d/%m/%Y %H:%M"
df2['timestamp'] = pd.to_datetime(df2['timestamp'])

# Replace '\x96' (en dash) in labels
df2['label'] = df2['label'].str.replace('\x96', '-', regex=False)

# Split out day of week
df2.loc[df2.fn.str.contains('^Sunday'), 'day'] = 'Sun'
df2.loc[df2.fn.str.contains('^Monday'), 'day'] = 'Mon'
df2.loc[df2.fn.str.contains('^Tuesday'), 'day'] = 'Tue'
df2.loc[df2.fn.str.contains('^Wednesday'), 'day'] = 'Wed'
df2.loc[df2.fn.str.contains('^Thursday'), 'day'] = 'Thu'
df2.loc[df2.fn.str.contains('^Friday'), 'day'] = 'Fri'
df2.loc[df2.fn.str.contains('^Saturday'), 'day'] = 'Sat'

# Get dummies for categorical variable "day"
tmpDays = pd.get_dummies(df2['day'], drop_first=True)
df2 = df2.drop(['day'], axis=1)
df2 = pd.concat([df2, tmpDays], axis=1)

# Drop records with protocol 0 or na
df2 = df2[df2['proto'] != 0]

# Get dummies for categorical variable "proto"
tmpProto = pd.get_dummies(df2['proto'], drop_first=True)
df2 = df2.drop(['proto'], axis=1)
df2 = pd.concat([df2, tmpProto], axis=1)
In [264]:
# Explore missing or unexpected data

display(df2)

# Explore missing data
dfmissing = df2[df2.isna().any(axis = 1)]
numMissing = df2.isna().any(axis = 1).sum()
print("# rows with na's: " + str(numMissing) + " (" + str(round(100 * numMissing / df2.shape[0], 3)) + "%)")
display(dfmissing)
flow_id srcip src_port dstip dst_port timestamp flow_duration tot_fwd_pkts tot_bwd_pkts tot_len_of_fwd_pkts tot_len_of_bwd_pkts fwd_pkt_len_max fwd_pkt_len_min fwd_pkt_len_mean fwd_pkt_len_std bwd_pkt_len_max bwd_pkt_len_min bwd_pkt_len_mean bwd_pkt_len_std flow_bps flow_pps flow_iat_mean flow_iat_std flow_iat_max flow_iat_min fwd_iat_tot fwd_iat_mean fwd_iat_std fwd_iat_max fwd_iat_min bwd_iat_tot bwd_iat_mean bwd_iat_std bwd_iat_max bwd_iat_min fwd_psh_flags bwd_psh_flags fwd_urg_flags bwd_urg_flags fwd_hdr_len bwd_hdr_len fwd_pps bwd_pps min_pkt_len max_pkt_len pkt_len_mean pkt_len_std pkt_len_var fin_flag_ct syn_flag_ct rst_flag_ct psh_flag_ct ack_flag_ct urg_flag_ct cwe_flag_ct ece_flag_ct down_up_ratio avg_pkt_sz avg_fwd_seg_sz avg_bwd_seg_sz fwd_hdr_len_1 fwd_avg_bytes_bulk fwd_avg_pkts_bulk fwd_avg_bulk_rate bwd_avg_bytes_bulk bwd_avg_pkts_bulk bwd_avg_bulk_rate subflow_fwd_pkts subflow_fwd_bytes subflow_bwd_pkts subflow_bwd_bytes init_win_bytes_fwd init_win_bytes_bwd act_data_pkt_fwd min_seg_sz_fwd active_mean active_std active_max active_min idle_mean idle_std idle_max idle_min label fn Mon Thu Tue Wed udp
0 192.168.10.5-104.16.207.165-54865-443-6 104.16.207.165 443 192.168.10.5 54865 2017-07-07 03:30:00 3 2 0 12.0 0.0 6.0 6.0 6.0 0.00000 0.0 0.0 0.0 0.0 4.000000e+06 666666.666700 3.0 0.000000 3.0 3.0 3.0 3.00000 0.00000 3.0 3.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 666666.666700 0.000000 6.0 6.0 6.000000 0.000000 0.000000 0 0 0 0 1 0 0 0 0.0 9.000000 6.0 0.0 40 0 0 0 0 0 0 2 12 0 0 33 -1 1 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
1 192.168.10.5-104.16.28.216-55054-80-6 104.16.28.216 80 192.168.10.5 55054 2017-07-07 03:30:00 109 1 1 6.0 6.0 6.0 6.0 6.0 0.00000 6.0 6.0 6.0 0.0 1.100917e+05 18348.623850 109.0 0.000000 109.0 109.0 0.0 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 9174.311927 9174.311927 6.0 6.0 6.000000 0.000000 0.000000 0 0 0 0 1 1 0 0 1.0 9.000000 6.0 6.0 20 0 0 0 0 0 0 1 6 1 6 29 256 0 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
2 192.168.10.5-104.16.28.216-55055-80-6 104.16.28.216 80 192.168.10.5 55055 2017-07-07 03:30:00 52 1 1 6.0 6.0 6.0 6.0 6.0 0.00000 6.0 6.0 6.0 0.0 2.307692e+05 38461.538460 52.0 0.000000 52.0 52.0 0.0 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 19230.769230 19230.769230 6.0 6.0 6.000000 0.000000 0.000000 0 0 0 0 1 1 0 0 1.0 9.000000 6.0 6.0 20 0 0 0 0 0 0 1 6 1 6 29 256 0 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
3 192.168.10.16-104.17.241.25-46236-443-6 104.17.241.25 443 192.168.10.16 46236 2017-07-07 03:30:00 34 1 1 6.0 6.0 6.0 6.0 6.0 0.00000 6.0 6.0 6.0 0.0 3.529412e+05 58823.529410 34.0 0.000000 34.0 34.0 0.0 0.00000 0.00000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 20 20 29411.764710 29411.764710 6.0 6.0 6.000000 0.000000 0.000000 0 0 0 0 1 1 0 0 1.0 9.000000 6.0 6.0 20 0 0 0 0 0 0 1 6 1 6 31 329 0 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
4 192.168.10.5-104.19.196.102-54863-443-6 104.19.196.102 443 192.168.10.5 54863 2017-07-07 03:30:00 3 2 0 12.0 0.0 6.0 6.0 6.0 0.00000 0.0 0.0 0.0 0.0 4.000000e+06 666666.666700 3.0 0.000000 3.0 3.0 3.0 3.00000 0.00000 3.0 3.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 40 0 666666.666700 0.000000 6.0 6.0 6.000000 0.000000 0.000000 0 0 0 0 1 0 0 0 0.0 9.000000 6.0 0.0 40 0 0 0 0 0 0 2 12 0 0 32 -1 1 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
692698 192.168.10.3-192.168.10.14-53-51114-17 192.168.10.14 51114 192.168.10.3 53 2017-05-07 12:10:00 32215 4 2 112.0 152.0 28.0 28.0 28.0 0.00000 76.0 76.0 76.0 0.0 8.194940e+03 186.248642 6443.0 13617.579480 30780.0 3.0 30832.0 10277.33333 17755.84381 30780.0 4.0 3.0 3.0 0.0 3.0 3.0 0 0 0 0 80 64 124.165761 62.082881 28.0 76.0 41.714286 23.421602 548.571429 0 0 0 0 0 0 0 0 0.0 48.666667 28.0 76.0 80 0 0 0 0 0 0 4 112 2 152 -1 -1 3 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 1
692699 192.168.10.3-192.168.10.16-53-24054-17 192.168.10.16 24054 192.168.10.3 53 2017-05-07 03:02:00 324 2 2 84.0 362.0 42.0 42.0 42.0 0.00000 181.0 181.0 181.0 0.0 1.376543e+06 12345.679010 108.0 183.597386 320.0 2.0 2.0 2.00000 0.00000 2.0 2.0 2.0 2.0 0.0 2.0 2.0 0 0 0 0 40 40 6172.839506 6172.839506 42.0 181.0 97.600000 76.133435 5796.300000 0 0 0 0 0 0 0 0 1.0 122.000000 42.0 181.0 40 0 0 0 0 0 0 2 84 2 362 -1 -1 1 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 1
692700 192.168.10.51-23.208.163.130-58030-443-6 23.208.163.130 443 192.168.10.51 58030 2017-05-07 10:06:00 82 2 1 31.0 6.0 31.0 0.0 15.5 21.92031 6.0 6.0 6.0 0.0 4.512195e+05 36585.365850 41.0 52.325902 78.0 4.0 4.0 4.00000 0.00000 4.0 4.0 0.0 0.0 0.0 0.0 0.0 1 0 0 0 64 20 24390.243900 12195.121950 0.0 31.0 17.000000 16.350331 267.333333 0 1 0 0 1 0 0 0 0.0 22.666667 15.5 6.0 64 0 0 0 0 0 0 2 31 1 6 1006 0 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0
692701 192.168.10.3-192.168.10.14-53-51694-17 192.168.10.14 51694 192.168.10.3 53 2017-05-07 01:19:00 1048635 6 2 192.0 256.0 32.0 32.0 32.0 0.00000 128.0 128.0 128.0 0.0 4.272221e+02 7.628965 149805.0 375521.040500 1000947.0 1.0 1033613.0 206722.60000 444210.06860 1000947.0 1.0 3.0 3.0 0.0 3.0 3.0 0 0 0 0 120 40 5.721724 1.907241 32.0 128.0 53.333333 42.332021 1792.000000 0 0 0 0 0 0 0 0 0.0 60.000000 32.0 128.0 120 0 0 0 0 0 0 6 192 2 256 -1 -1 5 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 1
692702 192.168.10.3-192.168.10.14-53-57949-17 192.168.10.14 57949 192.168.10.3 53 2017-05-07 02:43:00 94939 4 2 188.0 226.0 47.0 47.0 47.0 0.00000 113.0 113.0 113.0 0.0 4.360695e+03 63.198475 18987.8 31664.102560 73049.0 1.0 73051.0 24350.33333 42174.28246 73049.0 1.0 48.0 48.0 0.0 48.0 48.0 0 0 0 0 104 64 42.132317 21.066158 47.0 113.0 65.857143 32.204702 1037.142857 0 0 0 0 0 0 0 0 0.0 76.833333 47.0 113.0 104 0 0 0 0 0 0 4 188 2 226 -1 -1 3 20 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 1

2829047 rows × 90 columns

# rows with na's: 1358 (0.048%)
flow_id srcip src_port dstip dst_port timestamp flow_duration tot_fwd_pkts tot_bwd_pkts tot_len_of_fwd_pkts tot_len_of_bwd_pkts fwd_pkt_len_max fwd_pkt_len_min fwd_pkt_len_mean fwd_pkt_len_std bwd_pkt_len_max bwd_pkt_len_min bwd_pkt_len_mean bwd_pkt_len_std flow_bps flow_pps flow_iat_mean flow_iat_std flow_iat_max flow_iat_min fwd_iat_tot fwd_iat_mean fwd_iat_std fwd_iat_max fwd_iat_min bwd_iat_tot bwd_iat_mean bwd_iat_std bwd_iat_max bwd_iat_min fwd_psh_flags bwd_psh_flags fwd_urg_flags bwd_urg_flags fwd_hdr_len bwd_hdr_len fwd_pps bwd_pps min_pkt_len max_pkt_len pkt_len_mean pkt_len_std pkt_len_var fin_flag_ct syn_flag_ct rst_flag_ct psh_flag_ct ack_flag_ct urg_flag_ct cwe_flag_ct ece_flag_ct down_up_ratio avg_pkt_sz avg_fwd_seg_sz avg_bwd_seg_sz fwd_hdr_len_1 fwd_avg_bytes_bulk fwd_avg_pkts_bulk fwd_avg_bulk_rate bwd_avg_bytes_bulk bwd_avg_pkts_bulk bwd_avg_bulk_rate subflow_fwd_pkts subflow_fwd_bytes subflow_bwd_pkts subflow_bwd_bytes init_win_bytes_fwd init_win_bytes_bwd act_data_pkt_fwd min_seg_sz_fwd active_mean active_std active_max active_min idle_mean idle_std idle_max idle_min label fn Mon Thu Tue Wed udp
6796 192.168.10.16-198.54.12.145-36812-80-6 198.54.12.145 80 192.168.10.16 36812 2017-07-07 03:35:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 7633 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
14739 192.168.10.25-192.168.10.50-53581-37575-6 192.168.10.50 37575 192.168.10.25 53581 2017-07-07 03:46:00 0 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 44 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 1.0 0.0 0.0 0.0 44 0 0 0 0 0 0 1 0 1 0 408 65535 0 44 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
15047 192.168.10.17-1.1.70.73-48283-80-6 192.168.10.17 48283 1.1.70.73 80 2017-07-07 03:48:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 274 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
209728 192.168.10.17-192.168.10.50-39026-18467-6 192.168.10.17 39026 192.168.10.50 18467 2017-07-07 04:34:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 229 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv 0 0 0 0 0
12824 192.168.10.25-192.168.10.50-52509-13370-6 192.168.10.25 52509 192.168.10.50 13370 2017-07-07 01:30:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 65535 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Friday-WorkingHours-Afternoon-PortScan.pcap_IS... 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
602009 172.217.10.34-192.168.10.25-443-50834-6 172.217.10.34 443 192.168.10.25 50834 2017-05-07 10:59:00 0 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 32 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 1.0 0.0 0.0 0.0 32 0 0 0 0 0 0 1 0 1 0 426 65535 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0
629492 172.217.12.162-192.168.10.17-443-45113-6 172.217.12.162 443 192.168.10.17 45113 2017-05-07 01:33:00 0 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 32 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 1 0 0 1.0 0.0 0.0 0.0 32 0 0 0 0 0 0 1 0 1 0 413 535 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0
653553 160.68.117.59-192.168.10.12-443-48698-6 160.68.117.59 443 192.168.10.12 48698 2017-05-07 10:10:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 5072 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0
671012 192.168.10.50-192.168.10.51-22567-46394-6 192.168.10.51 46394 192.168.10.50 22567 2017-05-07 11:25:00 0 2 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 64 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 0.0 0.0 0.0 0.0 64 0 0 0 0 0 0 2 0 0 0 229 -1 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0
686806 185.86.139.29-192.168.10.16-443-33238-6 185.86.139.29 443 192.168.10.16 33238 2017-05-07 09:55:00 0 1 1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN inf 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 32 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 0 0 0 1 0 0 0 1.0 0.0 0.0 0.0 32 0 0 0 0 0 0 1 0 1 0 7635 40764 0 32 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 BENIGN Wednesday-workingHours.pcap_ISCX.csv 0 0 0 1 0

1358 rows × 90 columns

In [265]:
# Look for infinite values (which PCA will choke on)
for i in range(0, df2.shape[1]):
    t = df2.dtypes[i]
    if t == 'int64' or t == 'float64':
        numInf = np.isinf(df2.iloc[:,i]).sum()
        if numInf > 0:
            print(numInf, "infinite values in column", df2.columns[i])
            df2 = df2[df2[df2.columns[i]] != np.inf]
1509 infinite values in column flow_bps
1358 infinite values in column flow_pps
In [266]:
# Handle missing data

# Drop rows with NaNs
print("Shape before dropping rows with NaNs:", df2.shape)
df2.dropna(inplace = True)
print("Shape after dropping rows with NaNs:", df2.shape)
Shape before dropping rows with NaNs: (2826180, 90)
Shape after dropping rows with NaNs: (2826180, 90)
In [267]:
# Field info
pd.options.display.max_columns = df2.shape[1]
print("Data frame info:")
print(df2.info())
print()

# Categories of categorical variables
print("Label categories:")
print(df2['label'].unique())
print()

# Summary stats
print("Summary stats:")
print(df2.describe(include='all', datetime_is_numeric=True))
print()
Data frame info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2826180 entries, 0 to 692702
Data columns (total 90 columns):
 #   Column               Dtype         
---  ------               -----         
 0   flow_id              object        
 1   srcip                object        
 2   src_port             int64         
 3   dstip                object        
 4   dst_port             int64         
 5   timestamp            datetime64[ns]
 6   flow_duration        int64         
 7   tot_fwd_pkts         int64         
 8   tot_bwd_pkts         int64         
 9   tot_len_of_fwd_pkts  float64       
 10  tot_len_of_bwd_pkts  float64       
 11  fwd_pkt_len_max      float64       
 12  fwd_pkt_len_min      float64       
 13  fwd_pkt_len_mean     float64       
 14  fwd_pkt_len_std      float64       
 15  bwd_pkt_len_max      float64       
 16  bwd_pkt_len_min      float64       
 17  bwd_pkt_len_mean     float64       
 18  bwd_pkt_len_std      float64       
 19  flow_bps             float64       
 20  flow_pps             float64       
 21  flow_iat_mean        float64       
 22  flow_iat_std         float64       
 23  flow_iat_max         float64       
 24  flow_iat_min         float64       
 25  fwd_iat_tot          float64       
 26  fwd_iat_mean         float64       
 27  fwd_iat_std          float64       
 28  fwd_iat_max          float64       
 29  fwd_iat_min          float64       
 30  bwd_iat_tot          float64       
 31  bwd_iat_mean         float64       
 32  bwd_iat_std          float64       
 33  bwd_iat_max          float64       
 34  bwd_iat_min          float64       
 35  fwd_psh_flags        int64         
 36  bwd_psh_flags        int64         
 37  fwd_urg_flags        int64         
 38  bwd_urg_flags        int64         
 39  fwd_hdr_len          int64         
 40  bwd_hdr_len          int64         
 41  fwd_pps              float64       
 42  bwd_pps              float64       
 43  min_pkt_len          float64       
 44  max_pkt_len          float64       
 45  pkt_len_mean         float64       
 46  pkt_len_std          float64       
 47  pkt_len_var          float64       
 48  fin_flag_ct          int64         
 49  syn_flag_ct          int64         
 50  rst_flag_ct          int64         
 51  psh_flag_ct          int64         
 52  ack_flag_ct          int64         
 53  urg_flag_ct          int64         
 54  cwe_flag_ct          int64         
 55  ece_flag_ct          int64         
 56  down_up_ratio        float64       
 57  avg_pkt_sz           float64       
 58  avg_fwd_seg_sz       float64       
 59  avg_bwd_seg_sz       float64       
 60  fwd_hdr_len_1        int64         
 61  fwd_avg_bytes_bulk   int64         
 62  fwd_avg_pkts_bulk    int64         
 63  fwd_avg_bulk_rate    int64         
 64  bwd_avg_bytes_bulk   int64         
 65  bwd_avg_pkts_bulk    int64         
 66  bwd_avg_bulk_rate    int64         
 67  subflow_fwd_pkts     int64         
 68  subflow_fwd_bytes    int64         
 69  subflow_bwd_pkts     int64         
 70  subflow_bwd_bytes    int64         
 71  init_win_bytes_fwd   int64         
 72  init_win_bytes_bwd   int64         
 73  act_data_pkt_fwd     int64         
 74  min_seg_sz_fwd       int64         
 75  active_mean          float64       
 76  active_std           float64       
 77  active_max           float64       
 78  active_min           float64       
 79  idle_mean            float64       
 80  idle_std             float64       
 81  idle_max             float64       
 82  idle_min             float64       
 83  label                object        
 84  fn                   object        
 85  Mon                  uint8         
 86  Thu                  uint8         
 87  Tue                  uint8         
 88  Wed                  uint8         
 89  udp                  uint8         
dtypes: datetime64[ns](1), float64(45), int64(34), object(5), uint8(5)
memory usage: 1.8+ GB
None

Label categories:
['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack - Brute Force' 'Web Attack - XSS'
 'Web Attack - Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']

Summary stats:
                                       flow_id       srcip      src_port  \
count                                  2826180     2826180  2.826180e+06   
unique                                 1084509       16990           NaN   
top     192.168.10.255-192.168.10.3-137-137-17  172.16.0.1           NaN   
freq                                       523      558141           NaN   
mean                                       NaN         NaN  4.116262e+04   
min                                        NaN         NaN  1.000000e+00   
25%                                        NaN         NaN  3.283000e+04   
50%                                        NaN         NaN  5.095800e+04   
75%                                        NaN         NaN  5.842500e+04   
max                                        NaN         NaN  6.553500e+04   
std                                        NaN         NaN  2.227550e+04   

               dstip      dst_port                      timestamp  \
count        2826180  2.826180e+06                        2826180   
unique         19041           NaN                            NaN   
top     192.168.10.3           NaN                            NaN   
freq          685128           NaN                            NaN   
mean             NaN  8.066371e+03  2017-05-11 07:42:06.019697152   
min              NaN  1.000000e+00            2017-03-07 01:00:01   
25%              NaN  5.300000e+01            2017-04-07 04:25:00   
50%              NaN  8.000000e+01            2017-05-07 10:48:00   
75%              NaN  4.430000e+02            2017-06-07 12:44:00   
max              NaN  6.553500e+04            2017-07-07 12:59:00   
std              NaN  1.827873e+04                            NaN   

        flow_duration  tot_fwd_pkts  tot_bwd_pkts  tot_len_of_fwd_pkts  \
count    2.826180e+06  2.826180e+06  2.826180e+06         2.826180e+06   
unique            NaN           NaN           NaN                  NaN   
top               NaN           NaN           NaN                  NaN   
freq              NaN           NaN           NaN                  NaN   
mean     1.475845e+07  9.311210e+00  1.041004e+01         5.501821e+02   
min     -1.300000e+01  1.000000e+00  0.000000e+00         0.000000e+00   
25%      1.550000e+02  2.000000e+00  1.000000e+00         1.200000e+01   
50%      3.132900e+04  2.000000e+00  2.000000e+00         6.200000e+01   
75%      3.193939e+06  5.000000e+00  4.000000e+00         1.880000e+02   
max      1.200000e+08  2.197590e+05  2.919220e+05         1.290000e+07   
std      3.361178e+07  7.502717e+02  9.981930e+02         1.000163e+04   

        tot_len_of_bwd_pkts  fwd_pkt_len_max  fwd_pkt_len_min  \
count          2.826180e+06     2.826180e+06     2.826180e+06   
unique                  NaN              NaN              NaN   
top                     NaN              NaN              NaN   
freq                    NaN              NaN              NaN   
mean           1.618874e+04     2.079291e+02     1.874053e+01   
min            0.000000e+00     0.000000e+00     0.000000e+00   
25%            4.000000e+00     6.000000e+00     0.000000e+00   
50%            1.240000e+02     3.700000e+01     2.000000e+00   
75%            4.840000e+02     8.200000e+01     3.600000e+01   
max            6.554530e+08     2.482000e+04     2.325000e+03   
std            2.264914e+06     7.177155e+02     6.037169e+01   

        fwd_pkt_len_mean  fwd_pkt_len_std  bwd_pkt_len_max  bwd_pkt_len_min  \
count       2.826180e+06     2.826180e+06     2.826180e+06     2.826180e+06   
unique               NaN              NaN              NaN              NaN   
top                  NaN              NaN              NaN              NaN   
freq                 NaN              NaN              NaN              NaN   
mean        5.829124e+01     6.901951e+01     8.722535e+02     4.111387e+01   
min         0.000000e+00     0.000000e+00     0.000000e+00     0.000000e+00   
25%         6.000000e+00     0.000000e+00     2.000000e+00     0.000000e+00   
50%         3.400000e+01     0.000000e+00     8.000000e+01     0.000000e+00   
75%         5.000000e+01     2.616295e+01     2.820000e+02     7.700000e+01   
max         5.940857e+03     7.125597e+03     1.953000e+04     2.896000e+03   
std         1.862237e+02     2.814005e+02     1.947624e+03     6.889449e+01   

        bwd_pkt_len_mean  bwd_pkt_len_std      flow_bps      flow_pps  \
count       2.826180e+06     2.826180e+06  2.826180e+06  2.826180e+06   
unique               NaN              NaN           NaN           NaN   
top                  NaN              NaN           NaN           NaN   
freq                 NaN              NaN           NaN           NaN   
mean        3.064413e+02     3.358671e+02  1.492614e+06  7.087142e+04   
min         0.000000e+00     0.000000e+00 -2.610000e+08 -2.000000e+06   
25%         2.000000e+00     0.000000e+00  1.193804e+02  3.456822e+00   
50%         7.200000e+01     0.000000e+00  4.596715e+03  1.099913e+02   
75%         1.813333e+02     7.860195e+01  1.666667e+05  2.325581e+04   
max         5.800500e+03     8.194660e+03  2.071000e+09  4.000000e+06   
std         6.056207e+02     8.402626e+02  2.594791e+07  2.544386e+05   

        flow_iat_mean  flow_iat_std  flow_iat_max  flow_iat_min   fwd_iat_tot  \
count    2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique            NaN           NaN           NaN           NaN           NaN   
top               NaN           NaN           NaN           NaN           NaN   
freq              NaN           NaN           NaN           NaN           NaN   
mean     1.299647e+06  2.922084e+06  9.188572e+06  1.625879e+05  1.445528e+07   
min     -1.300000e+01  0.000000e+00 -1.300000e+01 -1.400000e+01  0.000000e+00   
25%      6.400000e+01  0.000000e+00  1.240000e+02  3.000000e+00  0.000000e+00   
50%      1.150593e+04  1.378151e+02  3.087100e+04  4.000000e+00  4.500000e+01   
75%      3.368703e+05  6.891644e+05  2.427224e+06  6.400000e+01  1.236743e+06   
max      1.200000e+08  8.480026e+07  1.200000e+08  1.200000e+08  1.200000e+08   
std      4.510501e+06  8.050574e+06  2.447515e+07  2.952431e+06  3.353336e+07   

        fwd_iat_mean   fwd_iat_std   fwd_iat_max   fwd_iat_min   bwd_iat_tot  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    2.613465e+06  3.270332e+06  9.048814e+06  1.023449e+06  9.909463e+06   
min     0.000000e+00  0.000000e+00  0.000000e+00 -1.200000e+01  0.000000e+00   
25%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%     2.900000e+01  0.000000e+00  4.400000e+01  3.000000e+00  3.000000e+00   
75%     2.058253e+05  6.572025e+04  9.252650e+05  4.800000e+01  9.963325e+04   
max     1.200000e+08  8.460293e+07  1.200000e+08  1.200000e+08  1.200000e+08   
std     9.532443e+06  9.645259e+06  2.454487e+07  8.598004e+06  2.875685e+07   

        bwd_iat_mean   bwd_iat_std   bwd_iat_max   bwd_iat_min  fwd_psh_flags  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   2.826180e+06   
unique           NaN           NaN           NaN           NaN            NaN   
top              NaN           NaN           NaN           NaN            NaN   
freq             NaN           NaN           NaN           NaN            NaN   
mean    1.808645e+06  1.488292e+06  4.692064e+06  9.688127e+05   4.642379e-02   
min     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   0.000000e+00   
25%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   0.000000e+00   
50%     3.000000e+00  0.000000e+00  3.000000e+00  1.000000e+00   0.000000e+00   
75%     1.843982e+04  1.590471e+04  6.086825e+04  4.500000e+01   0.000000e+00   
max     1.200000e+08  8.441801e+07  1.200000e+08  1.200000e+08   1.000000e+00   
std     8.894058e+06  6.283144e+06  1.717356e+07  8.315596e+06   2.104011e-01   

        bwd_psh_flags  fwd_urg_flags  bwd_urg_flags   fwd_hdr_len  \
count       2826180.0   2.826180e+06      2826180.0  2.826180e+06   
unique            NaN            NaN            NaN           NaN   
top               NaN            NaN            NaN           NaN   
freq              NaN            NaN            NaN           NaN   
mean              0.0   1.114579e-04            0.0 -2.603941e+04   
min               0.0   0.000000e+00            0.0 -3.221223e+10   
25%               0.0   0.000000e+00            0.0  4.000000e+01   
50%               0.0   0.000000e+00            0.0  6.400000e+01   
75%               0.0   0.000000e+00            0.0  1.200000e+02   
max               0.0   1.000000e+00            0.0  4.644908e+06   
std               0.0   1.055677e-02            0.0  2.106985e+07   

         bwd_hdr_len       fwd_pps       bwd_pps   min_pkt_len   max_pkt_len  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean   -2.276954e+03  6.394313e+04  7.006481e+03  1.645875e+01  9.519301e+02   
min    -1.073741e+09  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%     2.000000e+01  1.774760e+00  1.270092e-01  0.000000e+00  6.000000e+00   
50%     4.000000e+01  6.168270e+01  1.990604e+01  2.000000e+00  8.800000e+01   
75%     1.040000e+02  1.204819e+04  7.380074e+03  3.600000e+01  5.330000e+02   
max     5.838440e+06  3.000000e+06  2.000000e+06  1.448000e+03  2.482000e+04   
std     1.453381e+06  2.476743e+05  3.818145e+04  2.525050e+01  2.029509e+03   

        pkt_len_mean   pkt_len_std   pkt_len_var   fin_flag_ct   syn_flag_ct  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    1.722170e+02  2.954492e+02  4.869389e+05  3.530384e-02  4.642379e-02   
min     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%     6.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%     5.733333e+01  2.629068e+01  6.912000e+02  0.000000e+00  0.000000e+00   
75%     1.200000e+02  1.756193e+02  3.084213e+04  0.000000e+00  0.000000e+00   
max     3.337143e+03  4.731522e+03  2.240000e+07  1.000000e+00  1.000000e+00   
std     3.056611e+02  6.321992e+02  1.648703e+06  1.845467e-01  2.104011e-01   

         rst_flag_ct   psh_flag_ct   ack_flag_ct   urg_flag_ct   cwe_flag_ct  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    2.427305e-04  2.984626e-01  3.155040e-01  9.488638e-02  1.114579e-04   
min     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
75%     0.000000e+00  1.000000e+00  1.000000e+00  0.000000e+00  0.000000e+00   
max     1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+00   
std     1.557792e-02  4.575836e-01  4.647164e-01  2.930580e-01  1.055677e-02   

         ece_flag_ct  down_up_ratio    avg_pkt_sz  avg_fwd_seg_sz  \
count   2.826180e+06   2.826180e+06  2.826180e+06    2.826180e+06   
unique           NaN            NaN           NaN             NaN   
top              NaN            NaN           NaN             NaN   
freq             NaN            NaN           NaN             NaN   
mean    2.437920e-04   6.842268e-01  1.922860e+02    5.829124e+01   
min     0.000000e+00   0.000000e+00  0.000000e+00    0.000000e+00   
25%     0.000000e+00   0.000000e+00  7.750000e+00    6.000000e+00   
50%     0.000000e+00   1.000000e+00  7.250000e+01    3.400000e+01   
75%     0.000000e+00   1.000000e+00  1.495000e+02    5.000000e+01   
max     1.000000e+00   1.560000e+02  3.893333e+03    5.940857e+03   
std     1.561194e-02   6.805770e-01  3.320397e+02    1.862237e+02   

        avg_bwd_seg_sz  fwd_hdr_len_1  fwd_avg_bytes_bulk  fwd_avg_pkts_bulk  \
count     2.826180e+06   2.826180e+06           2826180.0          2826180.0   
unique             NaN            NaN                 NaN                NaN   
top                NaN            NaN                 NaN                NaN   
freq               NaN            NaN                 NaN                NaN   
mean      3.064413e+02  -2.603941e+04                 0.0                0.0   
min       0.000000e+00  -3.221223e+10                 0.0                0.0   
25%       2.000000e+00   4.000000e+01                 0.0                0.0   
50%       7.200000e+01   6.400000e+01                 0.0                0.0   
75%       1.813333e+02   1.200000e+02                 0.0                0.0   
max       5.800500e+03   4.644908e+06                 0.0                0.0   
std       6.056207e+02   2.106985e+07                 0.0                0.0   

        fwd_avg_bulk_rate  bwd_avg_bytes_bulk  bwd_avg_pkts_bulk  \
count           2826180.0           2826180.0          2826180.0   
unique                NaN                 NaN                NaN   
top                   NaN                 NaN                NaN   
freq                  NaN                 NaN                NaN   
mean                  0.0                 0.0                0.0   
min                   0.0                 0.0                0.0   
25%                   0.0                 0.0                0.0   
50%                   0.0                 0.0                0.0   
75%                   0.0                 0.0                0.0   
max                   0.0                 0.0                0.0   
std                   0.0                 0.0                0.0   

        bwd_avg_bulk_rate  subflow_fwd_pkts  subflow_fwd_bytes  \
count           2826180.0      2.826180e+06       2.826180e+06   
unique                NaN               NaN                NaN   
top                   NaN               NaN                NaN   
freq                  NaN               NaN                NaN   
mean                  0.0      9.311210e+00       5.501716e+02   
min                   0.0      1.000000e+00       0.000000e+00   
25%                   0.0      2.000000e+00       1.200000e+01   
50%                   0.0      2.000000e+00       6.200000e+01   
75%                   0.0      5.000000e+00       1.880000e+02   
max                   0.0      2.197590e+05       1.287034e+07   
std                   0.0      7.502717e+02       9.988100e+03   

        subflow_bwd_pkts  subflow_bwd_bytes  init_win_bytes_fwd  \
count       2.826180e+06       2.826180e+06        2.826180e+06   
unique               NaN                NaN                 NaN   
top                  NaN                NaN                 NaN   
freq                 NaN                NaN                 NaN   
mean        1.041004e+01       1.618840e+04        6.996586e+03   
min         0.000000e+00       0.000000e+00       -1.000000e+00   
25%         1.000000e+00       4.000000e+00       -1.000000e+00   
50%         2.000000e+00       1.240000e+02        2.510000e+02   
75%         4.000000e+00       4.840000e+02        8.192000e+03   
max         2.919220e+05       6.554530e+08        6.553500e+04   
std         9.981930e+02       2.264883e+06        1.434350e+04   

        init_win_bytes_bwd  act_data_pkt_fwd  min_seg_sz_fwd   active_mean  \
count         2.826180e+06      2.826180e+06    2.826180e+06  2.826180e+06   
unique                 NaN               NaN             NaN           NaN   
top                    NaN               NaN             NaN           NaN   
freq                   NaN               NaN             NaN           NaN   
mean          1.989484e+03      5.426774e+00   -2.746141e+03  7.918279e+04   
min          -1.000000e+00      0.000000e+00   -5.368707e+08  0.000000e+00   
25%          -1.000000e+00      0.000000e+00    2.000000e+01  0.000000e+00   
50%          -1.000000e+00      1.000000e+00    2.400000e+01  0.000000e+00   
75%           2.350000e+02      2.000000e+00    3.200000e+01  0.000000e+00   
max           6.553500e+04      2.135570e+05    1.380000e+02  1.100000e+08   
std           8.456933e+03      6.369392e+02    1.085865e+06  6.349001e+05   

          active_std    active_max    active_min     idle_mean      idle_std  \
count   2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06  2.826180e+06   
unique           NaN           NaN           NaN           NaN           NaN   
top              NaN           NaN           NaN           NaN           NaN   
freq             NaN           NaN           NaN           NaN           NaN   
mean    3.867159e+04  1.466043e+05  5.820398e+04  8.323819e+06  5.028549e+05   
min     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
25%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
50%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
75%     0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00  0.000000e+00   
max     7.420000e+07  1.100000e+08  1.100000e+08  1.200000e+08  7.690000e+07   
std     3.688951e+05  9.599700e+05  5.767597e+05  2.364610e+07  4.605474e+06   

            idle_max      idle_min    label  \
count   2.826180e+06  2.826180e+06  2826180   
unique           NaN           NaN       15   
top              NaN           NaN   BENIGN   
freq             NaN           NaN  2269630   
mean    8.701092e+06  7.928897e+06      NaN   
min     0.000000e+00  0.000000e+00      NaN   
25%     0.000000e+00  0.000000e+00      NaN   
50%     0.000000e+00  0.000000e+00      NaN   
75%     0.000000e+00  0.000000e+00      NaN   
max     1.200000e+08  1.200000e+08      NaN   
std     2.438251e+07  2.337941e+07      NaN   

                                          fn           Mon           Thu  \
count                                2826180  2.826180e+06  2.826180e+06   
unique                                     8           NaN           NaN   
top     Wednesday-workingHours.pcap_ISCX.csv           NaN           NaN   
freq                                  691076           NaN           NaN   
mean                                     NaN  1.872365e-01  1.621270e-01   
min                                      NaN  0.000000e+00  0.000000e+00   
25%                                      NaN  0.000000e+00  0.000000e+00   
50%                                      NaN  0.000000e+00  0.000000e+00   
75%                                      NaN  0.000000e+00  0.000000e+00   
max                                      NaN  1.000000e+00  1.000000e+00   
std                                      NaN  3.901013e-01  3.685673e-01   

                 Tue           Wed           udp  
count   2.826180e+06  2.826180e+06  2.826180e+06  
unique           NaN           NaN           NaN  
top              NaN           NaN           NaN  
freq             NaN           NaN           NaN  
mean    1.575728e-01  2.445265e-01  3.536491e-01  
min     0.000000e+00  0.000000e+00  0.000000e+00  
25%     0.000000e+00  0.000000e+00  0.000000e+00  
50%     0.000000e+00  0.000000e+00  0.000000e+00  
75%     0.000000e+00  0.000000e+00  1.000000e+00  
max     1.000000e+00  1.000000e+00  1.000000e+00  
std     3.643400e-01  4.298062e-01  4.781020e-01  

In [330]:
# Group by label
dfgrp = pd.DataFrame(df2.groupby(['label']).size())
dfgrp.columns = ['count']
dfgrp = dfgrp.sort_values(by = ['count'], ascending=False)
display(dfgrp)
dfgrp = dfgrp.sort_values(by = ['count'], ascending=True)

# Plot
#plt.barh(dfgrp.index, dfgrp['count'])
#plt.xscale('log')
#plt.xlabel('Flow count (log scale)')
#plt.ylabel('Flow label')
#plt.show()

# Plot
fig1 = px.bar(dfgrp, y=dfgrp.index, x='count', template='plotly_white', orientation='h', \
    log_x=True, text='count')
fig1.update_traces(marker_color='black')
fig1.update_xaxes(title="Flow count (log scale)")
fig1.update_yaxes(title="Label")
fig1.update_layout(title={'text': 'Network Flow Labels', 'xanchor': 'left', 'yanchor': 'top'})
fig1.show()
count
label
BENIGN 2269630
DoS Hulk 230124
PortScan 158798
DDoS 128025
DoS GoldenEye 10293
FTP-Patator 7935
SSH-Patator 5897
DoS slowloris 5796
DoS Slowhttptest 5499
Bot 1956
Web Attack - Brute Force 1507
Web Attack - XSS 652
Infiltration 36
Web Attack - Sql Injection 21
Heartbleed 11
In [30]:
# Principle component analysis

# Create the PCA instance
pca = PCA()

# Filter out quantitative fields
numFields = []
objFields = []
for i in range(0, df2.shape[1]):
    t = df2.dtypes[i]
    if t == 'int64' or t == 'float64' or t == 'uint8':
        numFields.append(df2.columns[i])
    else:
        objFields.append(df2.columns[i])
dfnum = df2[numFields]
dfobj = df2[objFields]
print("Numeric fields:", dfnum.shape[1])
print("Non-numeric fields:", dfobj.shape[1])

# PCA
pca.fit(dfnum)
Numeric fields: 84
Non-numeric fields: 6
Out[30]:
PCA()
In [31]:
# PCA summary
print('Components:')
print(pca.components_)
print()
print('Explained variance %:')
cumsum = 0
for i in range(0, len(pca.explained_variance_)):
    curr = pca.explained_variance_[i] / pca.explained_variance_.sum()
    cumsum += curr
    print(i, round(curr, 5), round(cumsum, 5))
print()

# Transform data and reapply the index of the original dataframe
dfnum2 = pd.DataFrame(pca.transform(dfnum))
dfnum2.index = dfnum.index
print('Transformed data:')
print(dfnum2)
print()
Components:
[[ 2.61696214e-05 -3.43866949e-05  4.35609305e-01 ... -3.58671939e-10
   1.66598117e-09 -1.50112192e-09]
 [ 9.12254632e-07 -2.06741368e-05  3.08128729e-01 ...  8.57131617e-10
  -2.55968392e-09 -1.17803428e-09]
 [-1.07530591e-06 -2.07959795e-07 -5.99300446e-04 ...  2.87217097e-11
  -2.80834352e-12  2.96412705e-11]
 ...
 [-0.00000000e+00  2.95293897e-19  2.38783378e-20 ... -2.43017458e-17
   1.29172227e-17  1.23155864e-16]
 [-0.00000000e+00 -2.45086785e-18 -1.27634737e-20 ...  7.29672922e-17
  -3.21689481e-17  3.21218716e-17]
 [-0.00000000e+00  5.70672085e-18  4.49849906e-20 ...  1.68401067e-16
  -1.73818556e-16 -9.43754316e-17]]

Explained variance %:
0 0.62969 0.62969
1 0.10835 0.73805
2 0.10552 0.84357
3 0.07992 0.92349
4 0.04028 0.96377
5 0.02 0.98377
6 0.00747 0.99124
7 0.00337 0.99461
8 0.00122 0.99583
9 0.00119 0.99702
10 0.00086 0.99788
11 0.00051 0.99839
12 0.00033 0.99872
13 0.00032 0.99904
14 0.00026 0.9993
15 0.00024 0.99953
16 0.00017 0.9997
17 0.0001 0.9998
18 5e-05 0.99986
19 3e-05 0.99989
20 3e-05 0.99992
21 2e-05 0.99994
22 2e-05 0.99996
23 1e-05 0.99998
24 1e-05 0.99999
25 0.0 0.99999
26 0.0 1.0
27 0.0 1.0
28 0.0 1.0
29 0.0 1.0
30 0.0 1.0
31 0.0 1.0
32 0.0 1.0
33 0.0 1.0
34 0.0 1.0
35 0.0 1.0
36 0.0 1.0
37 0.0 1.0
38 0.0 1.0
39 0.0 1.0
40 0.0 1.0
41 0.0 1.0
42 0.0 1.0
43 0.0 1.0
44 0.0 1.0
45 0.0 1.0
46 0.0 1.0
47 0.0 1.0
48 0.0 1.0
49 0.0 1.0
50 0.0 1.0
51 0.0 1.0
52 0.0 1.0
53 0.0 1.0
54 0.0 1.0
55 0.0 1.0
56 0.0 1.0
57 0.0 1.0
58 0.0 1.0
59 0.0 1.0
60 0.0 1.0
61 0.0 1.0
62 0.0 1.0
63 0.0 1.0
64 0.0 1.0
65 0.0 1.0
66 0.0 1.0
67 0.0 1.0
68 0.0 1.0
69 0.0 1.0
70 0.0 1.0
71 0.0 1.0
72 0.0 1.0
73 0.0 1.0
74 0.0 1.0
75 0.0 1.0
76 0.0 1.0
77 0.0 1.0
78 0.0 1.0
79 0.0 1.0
80 0.0 1.0
81 0.0 1.0
82 0.0 1.0
83 0.0 1.0

Transformed data:
                  0             1             2             3              4   \
0      -3.082614e+07 -2.600130e+06 -31026.723690  2.168173e+06  927177.757188   
1      -3.078692e+07 -2.537748e+06 -30529.167812 -1.723815e+06  943089.495008   
2      -3.078818e+07 -2.539681e+06 -30543.726412 -1.603095e+06  942606.651025   
3      -3.078942e+07 -2.541639e+06 -30558.454647 -1.480881e+06  942108.359429   
4      -3.082614e+07 -2.600130e+06 -31026.723689  2.168173e+06  927177.757146   
...              ...           ...           ...           ...            ...   
692698 -3.073471e+07 -2.537672e+06 -30582.333894 -1.825405e+06  926524.017554   
692699 -3.079927e+07 -2.558037e+06 -30709.461124 -4.576279e+05  937701.770607   
692700 -3.079034e+07 -2.543207e+06 -30632.643529 -1.382680e+06  941683.455907   
692701 -2.914012e+07 -2.549914e+06 -30085.526755 -1.821772e+06  352328.061513   
692702 -3.065616e+07 -2.534094e+06 -30600.016040 -1.828623e+06  897980.716020   

                   5              6              7              8   \
0      -597359.906508 -315107.685702 -167214.497643   63049.482900   
1      -603999.375923 -317603.056859 -168169.331773   63528.812482   
2      -603813.315899 -317547.371612 -168187.354982   63509.326422   
3      -603607.969274 -317473.631010 -168167.692115   63490.050077   
4      -597359.906348 -315107.685900 -167214.497813   63049.482760   
...               ...            ...            ...            ...   
692698 -590219.567875 -295042.606879 -158927.593426   55354.046847   
692699 -601741.508113 -316490.922755 -167767.045162   63645.029743   
692700 -603412.900222 -317369.286943 -168131.369947   63446.471296   
692701 -176764.992258  390685.977544   89448.068108 -163857.045385   
692702 -566956.131378 -263013.184391 -144774.180319   43886.783758   

                   9              10             11             12  \
0      -183554.961448 -114726.927068  -38794.851954 -285609.648188   
1      -183432.181049 -114536.291890  -38308.244617 -278969.294611   
2      -183422.650826 -114548.530255  -38274.575283 -279169.690004   
3      -183413.130047 -114547.999060  -38273.161663 -279330.556495   
4      -183554.961042 -114726.927298  -38794.851977 -285609.648408   
...               ...            ...            ...            ...   
692698 -161378.662201 -101882.066641  -29780.261229 -276608.397763   
692699 -182617.267876 -114360.529649  -37873.274867 -277324.263269   
692700 -183318.987744 -114498.508108  -38200.707020 -278998.391055   
692701  423978.632044  401907.969947  217889.427121 -189694.874687   
692702 -131056.567836  -84302.985808  -20442.510268 -263918.513540   

                13             14            15            16           17  \
0       796.157289  268732.748862  24118.927713 -54104.679808  4068.387359   
1       725.876417  257226.227439  21672.352036 -51268.178075  3620.751285   
2       727.581612  257519.086399  21713.794187 -51332.778650  3640.966376   
3       729.159011  257806.627685  21772.144880 -51397.308138  3652.392189   
4       796.157320  268732.748668  24118.927435 -54104.679689  4068.386615   
...            ...            ...           ...           ...          ...   
692698  692.997950  255443.071442  25623.603556 -49605.924898  2803.187440   
692699  666.475813  252218.135863  19576.959448 -51119.558909  3491.806019   
692700  721.050138  257346.188328  21675.090318 -51309.979657  3632.644709   
692701  947.039894  258434.056360  76549.584032 -45856.250547 -8171.248630   
692702  665.491873  253565.570015  33422.462944 -45875.956062  -925.684620   

                  18            19           20            21            22  \
0      -36316.268157  33874.505726 -1254.149841  32486.382920 -22812.714268   
1      -28766.887376  22256.100802  -887.372735  15216.138484  -8382.249218   
2      -28959.106508  22530.790456  -896.098237  15622.345379  -8719.826041   
3      -29141.659493  22807.398810  -904.852157  16025.544519  -9065.794771   
4      -36316.267135  33874.506612 -1254.149815  32486.381909 -22812.715922   
...              ...           ...          ...           ...           ...   
692698 -27544.634193  21803.387752  -849.693790  14340.437967  -7876.361425   
692699 -28473.813526  21984.667992  -872.558148  14803.854427  -8299.888859   
692700 -28939.759290  22532.155070  -905.632379  15635.301393  -8733.198561   
692701   9891.740561  20095.609942  -678.967493  16177.751897  -2380.695424   
692702 -29203.793363  19704.260841  -785.843396   9534.503352    467.908362   

                   23            24          25            26          27  \
0       814400.110895  24202.821134   23.516128   -787.894790 -143.032405   
1       -95274.003495  -9403.731249   58.371118  -5619.975954 -478.363784   
2       -74072.335713  -8624.908826   57.388621  -5521.036313 -472.577684   
3       -52706.018508  -7835.522873   56.286825  -5394.286004 -456.734676   
4       814400.091766  24202.821267   23.516705   -787.893658 -143.029837   
...               ...           ...         ...           ...         ...   
692698 -115417.912793 -10005.990042   28.611710  -5466.678039 -358.931197   
692699 -106379.866528  -9779.369495   56.654406  -5414.507524 -397.587601   
692700  -72591.718831  -8566.068969   56.672261  -5504.409397 -473.872726   
692701 -111085.036691  -2534.159586 -230.644589 -13455.495533  154.990467   
692702 -115151.011473  -8015.275587 -173.624982  -4892.413145 -670.356372   

                 28            29          30            31           32  \
0        -60.375503 -26020.218741  700.261217  47653.217033  -703.894915   
1       -335.700061   6541.366673  950.877370  60421.542259 -2066.141307   
2       -332.140161  18335.089743  921.370001  59341.896558 -2106.227525   
3       -335.361669  29849.718014  815.868216  52811.241337 -2733.418388   
4        -60.382665 -26020.312220  700.249521  47652.121314  -705.001469   
...             ...           ...         ...           ...          ...   
692698  -422.563009  -8906.228748   72.004663 -11657.866394 -7068.426678   
692699  -347.231209   -537.757635  294.877444   9373.474620 -6378.039151   
692700  -320.124030   9812.320896  953.803471  61210.438070  -933.753347   
692701 -5978.237629  -8663.509161   23.709275 -11764.495762 -7460.298591   
692702  1216.400717  -9218.175041   64.637849 -17113.967887 -7282.425115   

                 33            34           35          36          37  \
0       -229.996290 -11395.685417 -5906.893636 -256.140904  446.891726   
1       -274.243874 -12505.302465 -7964.232357 -262.833415  434.395426   
2       -247.048637 -12320.544923 -7976.997489  -41.817121  439.158407   
3       -268.014559  -5361.782446 -6596.133972  240.756479  322.218433   
4       -229.822393 -11394.143653 -5906.496616 -256.128053  446.879175   
...             ...           ...          ...         ...         ...   
692698  1316.658812    673.708732  -277.635964 -169.279829 -216.245497   
692699   315.356659  16877.727664   -99.315168   17.116463  -43.931489   
692700  -376.835881 -14923.080609 -8721.460593 -205.209606  464.902189   
692701  1506.611033    299.265139  -517.025838 -155.399553 -144.147640   
692702  1676.964973  -3390.196034  -331.782106 -182.393423 -160.919418   

              38          39        40          41        42         43  \
0      -2.054464  177.335554 -0.554643   -4.311132 -0.626252 -31.236908   
1      -3.645981  321.567541  3.045058   34.225732 -1.010377 -48.365763   
2      -3.510025  308.644483  2.813903   31.974331 -1.016327 -48.213846   
3      -3.077930  268.877859  3.215172   37.195267 -0.726933 -42.428628   
4      -2.054305  177.315780 -0.554654   -4.308859 -0.626267 -31.237934   
...          ...         ...       ...         ...       ...        ...   
692698 -1.061866   73.936985  0.773734   26.872176 -1.477868   4.440719   
692699  0.223551  -24.215837 -1.676562 -100.715622  1.220990  39.327565   
692700 -3.318409  292.690468  1.815615   22.875028 -1.397769 -44.807712   
692701 -1.070183   45.374534 -2.449485  -18.209320 -2.220225  14.300152   
692702 -0.434673   25.685410 -0.602453  -14.633451 -0.819497  20.853171   

               44         45         46         47         48        49  \
0       11.446364 -21.782041 -15.444887   3.685494  -6.401468 -0.602381   
1       23.040345 -14.655485  -3.181313  -4.673664   2.405915 -1.402444   
2       22.434304 -15.052943  -3.388271  -3.932894   1.918285 -1.463645   
3       20.679716  -9.545487   0.777307  -5.705747   4.519644 -1.795126   
4       11.445656 -21.780799 -15.442686   3.684524  -6.400082 -0.602283   
...           ...        ...        ...        ...        ...       ...   
692698  14.593138   3.056836  -7.391701   3.763212   6.274682 -1.098914   
692699  42.487167 -34.021920 -67.146537  71.456352 -56.694651  0.913569   
692700  16.231186 -12.224793   9.578812   2.366623  -3.123519 -0.380003   
692701  38.126133  -7.828856 -40.671216  40.953041 -12.550321  1.153550   
692702   8.263276 -19.328316 -37.004556  20.564391  -5.393155 -4.423319   

               50        51        52        53        54        55        56  \
0       -5.258306  1.983675  2.666146 -0.447386 -0.187091 -0.375653 -0.179373   
1       -2.998404  0.010514  0.991102 -0.301459 -0.253833  0.222400  0.137695   
2       -3.093179  0.095410  1.091081 -0.299904 -0.252417  0.205441  0.137584   
3       -1.775515 -0.381835  0.514145  0.115201 -0.205133  0.208945  0.158195   
4       -5.257807  1.983529  2.665911 -0.447314 -0.187073 -0.375652 -0.179373   
...           ...       ...       ...       ...       ...       ...       ...   
692698   4.673736  0.599172  0.103542 -4.419347  0.170463 -0.758975  0.659599   
692699  10.316147 -2.807342 -5.061199 -2.244891  0.118375 -0.002018  0.703739   
692700  -2.485240  1.016058 -3.577248  0.756544 -0.354560 -1.082876  0.737875   
692701  15.158686  0.488574 -2.389666 -7.027204  0.175825 -0.815775  0.680567   
692702  -3.360916  2.978245  5.712059 -7.384871  0.187372 -0.874612  0.701451   

              57        58        59        60        61        62        63  \
0      -0.159337 -0.010789 -0.206134 -0.015029 -0.212392  0.556106  0.067315   
1      -0.027452  0.029384  0.157025  0.211265 -0.354250  0.179832 -0.421925   
2      -0.025103  0.031594  0.156732  0.205389 -0.352844  0.179481 -0.422733   
3       0.028439  0.058633  0.227327  0.239393 -0.356003  0.161794 -0.432011   
4      -0.159325 -0.010784 -0.206123 -0.015014 -0.212394  0.556101  0.067314   
...          ...       ...       ...       ...       ...       ...       ...   
692698 -0.209265 -0.299245 -0.390379 -0.417076 -0.018690 -0.073456 -0.213373   
692699 -0.228623 -0.341790 -0.311261 -0.022064  0.039511 -0.088341  0.021670   
692700 -0.234943 -0.225085 -0.200570  0.013318  1.182761 -0.119136 -0.040145   
692701 -0.232671 -0.307656 -0.347872 -0.257251 -0.044682 -0.079677 -0.181765   
692702 -0.229887 -0.294077 -0.300024 -0.152890 -0.047224 -0.081942 -0.167239   

              64        65        66        67            68            69  \
0       0.057710  0.003527 -0.000749  0.001707  2.481156e-06 -1.925049e-09   
1      -0.075928 -0.003605 -0.000262 -0.001179  4.941021e-06 -1.023797e-09   
2      -0.074153 -0.002988 -0.000271 -0.001219  5.184989e-06 -1.012883e-09   
3      -0.074120 -0.002369 -0.000124 -0.001387  5.282047e-06 -1.103412e-09   
4       0.057711  0.003527 -0.000749  0.001707  2.481117e-06 -1.925055e-09   
...          ...       ...       ...       ...           ...           ...   
692698  0.071992 -0.009118  0.000196 -0.000255  2.529807e-06 -2.284447e-10   
692699  0.023521 -0.000610  0.000052  0.000326 -8.856551e-07 -4.858633e-10   
692700 -0.017637  0.000543  0.000821 -0.002551 -9.834296e-06 -3.998986e-10   
692701  0.054363 -0.008075 -0.000769 -0.000193  3.626718e-06 -5.298656e-10   
692702  0.043292 -0.007425  0.000204 -0.000238  2.877133e-06 -2.007525e-10   

                  70            71            72            73            74  \
0       1.230194e-08  3.516241e-10  2.165487e-09 -1.829363e-10 -2.743268e-10   
1       1.211693e-08  9.431437e-10  2.500739e-09 -1.573568e-10 -2.507369e-10   
2       1.211880e-08  9.259485e-10  2.500205e-09 -1.574829e-10 -2.517139e-10   
3       1.136397e-08  9.044622e-10  2.558699e-09 -1.635154e-10 -2.515165e-10   
4       1.230176e-08  3.516243e-10  2.165501e-09 -1.829381e-10 -2.743263e-10   
...              ...           ...           ...           ...           ...   
692698  1.010741e-08  8.962695e-10  2.650598e-09 -1.784907e-10 -2.431961e-10   
692699  8.410760e-09  7.005964e-10  2.500253e-09 -1.545035e-10 -2.589572e-10   
692700  1.228019e-08  8.877594e-10  2.420016e-09 -1.568342e-10 -2.531723e-10   
692701  9.446566e-09  8.618241e-10  2.411556e-09 -1.909516e-10 -2.266164e-10   
692702  1.041642e-08  8.763309e-10  2.559252e-09 -1.707907e-10 -2.435998e-10   

                  75            76            77            78            79  \
0      -2.834910e-09 -8.015333e-10  1.137580e-10  2.717402e-13 -3.760759e-12   
1      -3.075818e-09 -7.435362e-10  1.641667e-10  2.803643e-13 -3.860477e-12   
2      -3.069963e-09 -7.452518e-10  1.630312e-10  2.710759e-13 -3.839352e-12   
3      -3.037866e-09 -7.462920e-10  1.618906e-10  2.236751e-13 -3.742953e-12   
4      -2.834903e-09 -8.015330e-10  1.137580e-10  2.717123e-13 -3.760692e-12   
...              ...           ...           ...           ...           ...   
692698 -2.992568e-09 -7.376213e-10  1.645831e-10  6.514546e-14 -3.434730e-12   
692699 -3.027277e-09 -7.573691e-10  1.475397e-10  6.614290e-14 -3.361693e-12   
692700 -3.070139e-09 -7.502390e-10  1.602251e-10  3.144676e-13 -3.915861e-12   
692701 -2.944808e-09 -7.028006e-10  1.403926e-10  4.995858e-14 -3.404274e-12   
692702 -3.026402e-09 -7.362897e-10  1.637708e-10  6.496810e-14 -3.443207e-12   

                  80            81            82            83  
0       2.501400e-12  2.089881e-13 -1.689769e-12  3.109255e-13  
1       2.285121e-12  2.180308e-13 -1.906283e-12  3.067186e-13  
2       2.275299e-12  2.230018e-13 -1.904407e-12  3.030477e-13  
3       2.267880e-12  2.261445e-13 -1.878574e-12  2.469024e-13  
4       2.501375e-12  2.089786e-13 -1.689795e-12  3.109170e-13  
...              ...           ...           ...           ...  
692698  2.279951e-12  1.825018e-13 -1.776949e-12  1.020557e-14  
692699  2.295071e-12  1.951663e-13 -1.750706e-12  3.750200e-14  
692700  2.303671e-12  2.260045e-13 -1.877610e-12  3.274979e-13  
692701  2.157672e-12  2.127035e-13 -1.705541e-12  3.548540e-14  
692702  2.269897e-12  1.762781e-13 -1.771518e-12  1.428569e-14  

[2826180 rows x 84 columns]

In [201]:
# KNN modeling prep

# KNN parameters to iterate through
knn_vals = [3, 6, 9, 12]
knn_weights = ['uniform', 'distance']
knn_leaf_sizes = [15, 20]

# Max features to use (5 features explain 96% of variance)
max_feat = 5

# Scoring method to use for model runs
scoring_method = ['f1_macro', 'f1_micro', 'f1_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted']
refit_param = 'roc_auc_ovr'

# Cross-validation folds
cv_folds = 5

# The first 14 components explain 99.9% of variability;
# append the first 14 PCA columns to the non-numerical dataframe
df3 = pd.concat([dfobj, dfnum2.iloc[:,0:(max_feat + 1)]], axis=1)

# Get the dependent variable
y = df3['label']

# Drop the dependent variable ("label"), along with other variables that don't make sense to use in the model
x = df3.drop(['label', 'flow_id', 'srcip', 'dstip', 'timestamp', 'fn'], axis=1)

# Change the names of PCA columns to be strings so that python doesn't complain
x.columns = x.columns.astype(str)
for i in range(5, x.shape[1]):
    x.rename({x.columns[i]: 'pca' + x.columns[i]}, axis = 1, inplace = True)

# Split into training and test sets
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=777, stratify=y)

# Standardize by subtracting the mean and scaling (i.e. dividing by stdev)
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
In [138]:
from sklearn.metrics import SCORERS
print(sorted(SCORERS.keys()))
print(sklearn.__version__)
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score']
1.0.2
In [194]:
# TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# KNN modeling - this takes a long time
# TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

# Init results array
r_test = []

max_feat_test = 2
knn_vals_test = [3, 9]
knn_weights_test = ['distance']
knn_leaf_sizes_test = [15]
cv_folds_test = 2
#scoring_method_test = 'f1_weighted'
scoring_method_test = ['f1_macro', 'f1_micro', 'f1_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted']
#refit_param_test = False
refit_param_test = 'roc_auc_ovr'

# Iterate from 2 to 14 to see how using fewer features improves
# model speed while maintaining performance
for feat in range(2, (max_feat_test + 1)):

    # Start timer
    print("Modeling with", feat, "PCA columns")
    print()
    t1 = time.time()

    # Choose only the first [feat] columns to train with
    x_train2 = x_train[:,0:feat]
    
    # Parameters to vary using grid search
    # (from https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee)
    params = [{'n_neighbors': knn_vals_test, \
               'weights': knn_weights_test, \
               'leaf_size': knn_leaf_sizes_test}]

    # Instantiate knn
    knn = KNeighborsClassifier()

    # Grid search using knn params
    gs_knn = GridSearchCV(knn, param_grid=params, scoring=scoring_method_test, cv=cv_folds_test, verbose=3, \
        refit=refit_param_test)
    gs_knn.fit(x_train2, y_train)
    print()
    
    # Predict malware class on test cases
    print("Predicting malware class on test cases")
    y_pred = gs_knn.predict(x_test[:,0:feat])
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_pred))
    cr = classification_report(y_test, y_pred, output_dict=True, labels=np.unique(y_pred))
    print()
    
    # Append resutls to array
    print("Scoring and appending results")
    r_test.append( \
        { \
            'feat': feat, \
            'knn_k': gs_knn.best_params_['n_neighbors'], \
            'knn_weights': gs_knn.best_params_['weights'], \
            'knn_leaf_size': gs_knn.best_params_['leaf_size'], \
            'score': gs_knn.score(x_train2, y_train), \
            'cm': cm.tolist(), \
            'cr': cr, \
            't': (time.time() - t1), \
            'cvr': gs_knn.cv_results_ \
        } \
    )
    print()

print("Done fitting with KNN")
Modeling with 2 PCA columns

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.987) total time= 1.1min
[CV 2/2] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.736) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.901) roc_auc_ovr_weighted: (test=0.987) total time= 1.1min
[CV 1/2] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time= 1.1min
[CV 2/2] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.993) total time= 1.2min

Predicting malware class on test cases

Scoring and appending results

Done fitting with KNN
In [200]:
# TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# Results
# TEST !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

e = r_test[0]
print(e['cvr'])
#print(multilabel_confusion_matrix(y_test, y_pred, labels=np.unique(y_pred)))
#print(confusion_matrix(y_test, y_pred, labels=np.unique(y_pred)))

for e in r_test:
    for k in e.keys():
        print(k, type(e[k]))

fh = open('out.json', 'w')
fh.write(str(r_test))
fh.close()
{'mean_fit_time': array([4.7936244 , 4.49001873]), 'std_fit_time': array([0.01810884, 0.11299288]), 'mean_score_time': array([60.80041277, 64.77530456]), 'std_score_time': array([1.17025292, 2.63501811]), 'param_leaf_size': masked_array(data=[15, 15],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_n_neighbors': masked_array(data=[3, 9],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'param_weights': masked_array(data=['distance', 'distance'],
             mask=[False, False],
       fill_value='?',
            dtype=object), 'params': [{'leaf_size': 15, 'n_neighbors': 3, 'weights': 'distance'}, {'leaf_size': 15, 'n_neighbors': 9, 'weights': 'distance'}], 'split0_test_f1_macro': array([0.73207083, 0.74229839]), 'split1_test_f1_macro': array([0.73608658, 0.72889469]), 'mean_test_f1_macro': array([0.7340787 , 0.73559654]), 'std_test_f1_macro': array([0.00200788, 0.00670185]), 'rank_test_f1_macro': array([2, 1]), 'split0_test_f1_micro': array([0.98058156, 0.98109551]), 'split1_test_f1_micro': array([0.98066737, 0.98107162]), 'mean_test_f1_micro': array([0.98062446, 0.98108357]), 'std_test_f1_micro': array([4.29024337e-05, 1.19419145e-05]), 'rank_test_f1_micro': array([2, 1]), 'split0_test_f1_weighted': array([0.98060531, 0.9810909 ]), 'split1_test_f1_weighted': array([0.980695 , 0.9810743]), 'mean_test_f1_weighted': array([0.98065016, 0.9810826 ]), 'std_test_f1_weighted': array([4.48413417e-05, 8.30130612e-06]), 'rank_test_f1_weighted': array([2, 1]), 'split0_test_roc_auc_ovr': array([0.90764561, 0.93509814]), 'split1_test_roc_auc_ovr': array([0.90137343, 0.92600372]), 'mean_test_roc_auc_ovr': array([0.90450952, 0.93055093]), 'std_test_roc_auc_ovr': array([0.00313609, 0.00454721]), 'rank_test_roc_auc_ovr': array([2, 1]), 'split0_test_roc_auc_ovr_weighted': array([0.98703227, 0.99250174]), 'split1_test_roc_auc_ovr_weighted': array([0.98716332, 0.99254448]), 'mean_test_roc_auc_ovr_weighted': array([0.98709779, 0.99252311]), 'std_test_roc_auc_ovr_weighted': array([6.55263003e-05, 2.13718427e-05]), 'rank_test_roc_auc_ovr_weighted': array([2, 1])}
feat <class 'int'>
knn_k <class 'int'>
knn_weights <class 'str'>
knn_leaf_size <class 'int'>
score <class 'numpy.float64'>
cm <class 'list'>
cr <class 'dict'>
t <class 'float'>
cvr <class 'dict'>
In [202]:
# KNN modeling - this takes a long time

# Init results array
r = []

# Iterate from 2 to max_feat to see how using fewer features improves
# model speed while maintaining performance
for feat in range(2, (max_feat + 1)):

    # Start timer
    print("Modeling with", feat, "PCA columns")
    print()
    t1 = time.time()

    # Choose only the first [feat] columns to train with
    x_train2 = x_train[:,0:feat]
    
    # Parameters to vary using grid search
    # (from https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee)
    params = [{'n_neighbors': knn_vals, \
               'weights': knn_weights, \
               'leaf_size': knn_leaf_sizes}]

    # Instantiate knn
    knn = KNeighborsClassifier()

    # Grid search using knn params
    gs_knn = GridSearchCV(knn, param_grid=params, scoring=scoring_method, cv=cv_folds, verbose=3, refit=refit_param)
    gs_knn.fit(x_train2, y_train)
    print()
    
    # Predict malware class on test cases
    print("Predicting malware class on test cases")
    y_pred = gs_knn.predict(x_test[:,0:feat])
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_pred))
    cr = classification_report(y_test, y_pred, output_dict=True, labels=np.unique(y_pred))
    print()
    
    # Append resutls to array
    print("Scoring and appending results")
    r.append( \
        { \
            'feat': feat, \
            'knn_k': gs_knn.best_params_['n_neighbors'], \
            'knn_weights': gs_knn.best_params_['weights'], \
            'knn_leaf_size': gs_knn.best_params_['leaf_size'], \
            'score': gs_knn.score(x_train2, y_train), \
            'cm': cm.tolist(), \
            'cr': cr, \
            't': (time.time() - t1), \
            'cvr': gs_knn.cv_results_ \
        } \
    )
    print()
    fh = open('out.json', 'w')
    fh.write(str(r))
    fh.close()

print("Done fitting with KNN")
Modeling with 2 PCA columns

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time=  34.9s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.721) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time=  35.2s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.749) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time=  34.3s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.675) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time=  33.9s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.745) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time=  33.7s
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.767) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time=  28.8s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.726) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time=  28.8s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.753) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time=  28.8s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time=  30.7s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time=  30.1s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.703) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time=  35.2s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.713) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time=  35.2s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time=  35.8s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.660) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time=  35.7s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time=  34.3s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.774) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time=  30.8s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time=  35.1s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time=  32.2s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time=  31.2s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time=  31.6s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.707) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time=  38.5s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time=  37.1s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.651) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time=  35.9s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time=  37.0s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.724) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time=  36.6s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.772) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time=  36.1s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.727) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time=  34.8s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time=  32.3s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time=  30.4s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time=  31.6s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.646) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time=  37.1s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.645) f1_micro: (test=0.979) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  36.3s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.639) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time=  36.0s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.638) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time=  37.2s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.994) total time=  35.6s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.751) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time=  31.0s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  31.8s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time=  34.6s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.699) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time=  31.8s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time=  30.5s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time=  37.3s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.721) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time=  34.4s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.749) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time=  34.0s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.675) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time=  35.3s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.745) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time=  35.5s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.767) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time=  29.1s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.726) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time=  29.3s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.753) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time=  30.8s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time=  29.0s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time=  28.3s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.703) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time=  34.8s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.713) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time=  38.0s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time=  35.8s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.660) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time=  34.7s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time=  36.4s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.774) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time=  29.8s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time=  31.4s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time=  29.6s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time=  30.3s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time=  30.5s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.707) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time=  35.2s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time=  35.1s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.651) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time=  37.0s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time=  36.4s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.724) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time=  34.7s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.772) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time=  30.9s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.727) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time=  31.0s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time=  30.3s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time=  30.2s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time=  30.6s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.646) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time=  37.3s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.645) f1_micro: (test=0.979) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  35.9s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.639) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time=  36.3s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.638) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time=  38.0s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.994) total time=  35.8s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.751) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time=  30.7s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  32.3s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time=  32.1s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.699) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time=  31.0s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time=  30.2s

Predicting malware class on test cases

Scoring and appending results

Modeling with 3 PCA columns

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time=  37.2s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time=  38.5s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.752) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time=  37.6s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time=  36.9s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.747) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time=  37.8s
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time=  32.5s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time=  31.7s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time=  31.7s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time=  32.7s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time=  31.5s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.705) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time=  37.9s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  39.5s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.711) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time=  39.2s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.661) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time=  38.0s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time=  38.7s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.778) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time=  34.1s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  33.0s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.734) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  33.1s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time=  34.1s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time=  32.3s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time=  38.7s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.662) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  40.2s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time=  39.0s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.656) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time=  39.1s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time=  39.8s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.775) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time=  34.8s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.730) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  34.1s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time=  34.3s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time=  35.6s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time=  33.6s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.649) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.994) total time=  39.6s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.648) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.994) total time=  42.7s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time=  40.5s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time=  39.9s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.643) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time=  41.4s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  34.9s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.995) total time=  34.8s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time=  36.3s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.702) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time=  35.9s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.984) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.995) total time=  34.6s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time=  37.7s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time=  38.7s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.752) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time=  37.1s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time=  37.7s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.747) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time=  38.4s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time=  31.4s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time=  31.6s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time=  33.1s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time=  32.9s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time=  31.1s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.705) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time=  38.3s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  38.7s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.711) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time=  38.0s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.661) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time=  38.4s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time=  38.5s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.778) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time=  32.5s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  33.1s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.734) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time=  34.0s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time=  34.1s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time=  32.3s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time=  39.7s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.662) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  40.8s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time=  38.8s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.656) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time=  39.7s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time=  40.5s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.775) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time=  34.0s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.730) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time=  34.2s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time=  35.8s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time=  34.6s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time=  33.4s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.649) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.994) total time=  41.1s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.648) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.994) total time=  41.2s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time=  39.8s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time=  41.9s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.643) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time=  40.4s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  34.5s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.995) total time=  36.0s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time=  36.1s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.702) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time=  34.8s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.984) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.995) total time=  34.8s

Predicting malware class on test cases

Scoring and appending results

Modeling with 4 PCA columns

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.729) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time=  40.0s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.736) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time=  39.3s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.763) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time=  38.6s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time=  39.3s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.759) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time=  39.0s
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.782) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time=  33.2s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time=  33.5s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time=  34.9s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time=  35.0s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.760) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time=  33.0s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time=  41.0s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.730) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time=  41.6s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.995) total time=  39.7s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.674) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.995) total time=  41.3s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.739) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  40.7s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.788) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time=  34.6s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time=  35.7s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time=  35.2s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.717) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.996) total time=  34.9s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.759) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  34.4s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time=  41.8s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.672) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time=  41.2s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.665) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time=  41.3s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.668) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time=  42.0s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.715) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time=  40.8s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.786) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time=  36.0s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time=  37.0s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time=  35.9s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.716) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time=  35.9s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time=  36.9s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time=  42.1s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.658) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time=  42.2s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.653) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.996) total time=  44.1s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.655) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time=  42.4s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.657) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.996) total time=  42.1s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.764) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.997) total time=  38.6s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time=  37.3s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.997) total time=  37.0s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.712) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time=  39.0s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.997) total time=  37.2s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.729) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time=  38.3s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.736) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time=  40.6s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.763) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time=  39.5s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time=  38.7s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.759) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time=  40.0s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.782) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time=  34.3s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time=  33.4s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time=  34.0s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time=  34.1s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.760) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time=  33.0s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time=  40.0s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.730) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time=  42.2s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.995) total time=  40.3s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.674) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.995) total time=  40.7s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.739) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  41.6s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.788) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time=  35.0s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time=  34.9s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time=  36.5s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.717) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.996) total time=  35.4s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.759) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time=  34.4s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time=  42.0s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.672) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time=  41.5s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.665) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time=  40.9s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.668) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time=  42.9s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.715) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time=  41.5s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.786) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time=  35.8s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time=  37.5s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time=  36.9s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.716) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time=  36.1s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time=  36.6s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time=  43.3s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.658) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time=  42.3s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.653) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.996) total time=  43.1s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.655) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time=  42.0s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.657) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.996) total time=  41.7s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.764) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.997) total time=  38.2s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time=  38.8s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.997) total time=  36.9s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.712) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time=  38.7s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.997) total time=  36.9s

Predicting malware class on test cases

Scoring and appending results

Modeling with 5 PCA columns

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.761) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time=  42.4s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.806) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  41.8s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  43.1s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.702) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time=  42.3s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time=  40.6s
[CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.810) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time=  36.3s
[CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.811) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  35.6s
[CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.808) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  35.1s
[CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time=  37.0s
[CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.806) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time=  36.2s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.722) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time=  42.0s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.748) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time=  44.2s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.731) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time=  43.1s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.687) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time=  42.5s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.754) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time=  44.1s
[CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.812) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time=  37.3s
[CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time=  37.4s
[CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.801) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time=  38.6s
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time=  37.8s
[CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.790) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time=  36.8s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time=  45.5s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time=  44.6s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.682) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.997) total time=  44.0s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.683) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time=  46.2s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.733) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time=  43.2s
[CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.820) f1_micro: (test=0.993) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time=  39.0s
[CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time=  39.3s
[CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.998) total time=  38.6s
[CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time=  39.5s
[CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time=  38.7s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time=  44.7s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.679) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time=  46.4s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.671) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time=  46.1s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.673) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time=  45.0s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.676) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time=  45.7s
[CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.818) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time=  39.6s
[CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time=  40.1s
[CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time=  42.2s
[CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.723) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time=  40.4s
[CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.770) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time=  39.8s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.761) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time=  42.8s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.806) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  41.1s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  40.9s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.702) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time=  42.9s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time=  40.5s
[CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.810) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time=  35.8s
[CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.811) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  36.5s
[CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.808) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time=  35.5s
[CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time=  35.6s
[CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.806) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time=  36.6s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.722) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time=  43.0s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.748) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time=  42.5s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.731) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time=  45.0s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.687) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time=  42.9s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.754) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time=  42.3s
[CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.812) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time=  37.8s
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time=  37.3s
[CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.801) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time=  37.0s
[CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time=  39.2s
[CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.790) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time=  37.2s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time=  43.6s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time=  46.0s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.682) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.997) total time=  44.1s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.683) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time=  44.7s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.733) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time=  44.8s
[CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.820) f1_micro: (test=0.993) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time=  38.5s
[CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time=  39.7s
[CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.998) total time=  40.1s
[CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time=  38.8s
[CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time=  39.0s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time=  45.2s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.679) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time=  44.9s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.671) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time=  45.9s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.673) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time=  45.1s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.676) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time=  45.4s
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.818) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time=  42.9s
[CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time=  40.4s
[CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time=  40.3s
[CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.723) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time=  41.8s
[CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.770) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time=  39.3s

Predicting malware class on test cases

Scoring and appending results

Done fitting with KNN
In [273]:
# Explore classification report for first run so we know what values we're interested in
e = r[0]

print("PCA features:", e['feat'])
print("knn_k:", e['knn_k'])
print("knn_weights:", e['knn_weights'])
print("knn_leaf_size:", e['knn_leaf_size'])
print("runtime:", e['t'])
print("score:", e['score'])
#print("cross-validation results:", e['cvr'])
for k in e['cr'].keys():
    print(str(k) + ":", e['cr'][k])

# Confusion matrix (true values are in rows, predicted values in columns)
cm = np.array(e['cm'])
print('Confusion matrix:')
print(cm)
print()

# Correct values
cm_cor = cm * np.identity(cm.shape[0])
print('Correct values:')
print(cm_cor)
print()

# Incorrect values
cm_incor = cm - cm_cor
print('Incorrect values:')
print(cm_incor)
print()

# True positives
cm_tp = sum(cm_cor)
print('True positives:')
print(cm_tp)
print()

# False positives
cm_fp = np.sum(cm_incor, axis=0)
print('False positives:')
print(cm_fp)
print(np.sum(cm_fp))
print()

# False negatives
cm_fn = np.sum(cm_incor, axis=1)
print('False negatives:')
print(cm_fn)
print(np.sum(cm_fp))
print()

# True negatives
cm_tn = np.zeros(cm.shape[0])
for i in range(0, cm.shape[0]):
    # Zero out the ith row and ith column
    cm_tmp = cm.copy()
    cm_tmp[i,:] = 0
    cm_tmp[:,i] = 0
    cm_tn[i] = np.sum(cm_tmp)
print('True negatives:')
print(cm_tn)
print(np.sum(cm_tn))
print()
PCA features: 2
knn_k: 12
knn_weights: distance
knn_leaf_size: 15
runtime: 2760.9946868419647
score: 1.0
BENIGN: {'precision': 0.9917280568329077, 'recall': 0.9896547014271049, 'f1-score': 0.9906902943313672, 'support': 453926}
Bot: {'precision': 0.7622641509433963, 'recall': 0.5166240409207161, 'f1-score': 0.6158536585365854, 'support': 391}
DDoS: {'precision': 0.9000755001887505, 'recall': 0.9311853153680921, 'f1-score': 0.9153661579806124, 'support': 25605}
DoS GoldenEye: {'precision': 0.8952520802741067, 'recall': 0.8882952889752307, 'f1-score': 0.8917601170160898, 'support': 2059}
DoS Hulk: {'precision': 0.9583306204831044, 'recall': 0.9594133623030962, 'f1-score': 0.9588716857397234, 'support': 46025}
DoS Slowhttptest: {'precision': 0.9155435759209344, 'recall': 0.9263636363636364, 'f1-score': 0.920921825576141, 'support': 1100}
DoS slowloris: {'precision': 0.9645328719723183, 'recall': 0.9620362381363244, 'f1-score': 0.9632829373650107, 'support': 1159}
FTP-Patator: {'precision': 0.9467005076142132, 'recall': 0.9401386263390044, 'f1-score': 0.943408156813152, 'support': 1587}
Heartbleed: {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2}
PortScan: {'precision': 0.9918775676608023, 'recall': 0.9958438287153653, 'f1-score': 0.9938567410875612, 'support': 31760}
SSH-Patator: {'precision': 0.9237560192616372, 'recall': 0.9754237288135593, 'f1-score': 0.9488870568837592, 'support': 1180}
Web Attack - Brute Force: {'precision': 0.6740506329113924, 'recall': 0.707641196013289, 'f1-score': 0.6904376012965965, 'support': 301}
Web Attack - XSS: {'precision': 0.3977272727272727, 'recall': 0.2692307692307692, 'f1-score': 0.3211009174311927, 'support': 130}
micro avg: {'precision': 0.9835113120891097, 'recall': 0.9835304524746782, 'f1-score': 0.9835208821887708, 'support': 565225}
macro avg: {'precision': 0.8709106812916028, 'recall': 0.8509115948158608, 'f1-score': 0.8580336269275223, 'support': 565225}
weighted avg: {'precision': 0.9835767736876729, 'recall': 0.9835304524746782, 'f1-score': 0.983519526836341, 'support': 565225}
Confusion matrix:
[[449230     61   2284    186   1615     91     36     83      0    246
      79     13      2]
 [   177    202     11      0      1      0      0      0      0      0
       0      0      0]
 [  1479      2  23843     12    267      0      0      0      0      2
       0      0      0]
 [   200      0     12   1829     16      0      0      1      0      0
       0      1      0]
 [  1508      0    334     15  44157      0      2      0      0      9
       0      0      0]
 [    77      0      1      0      0   1019      3      0      0      0
       0      0      0]
 [    35      0      1      0      1      3   1115      0      0      2
       1      1      0]
 [    94      0      0      0      0      0      0   1492      0      0
       0      1      0]
 [     0      0      0      0      0      0      0      0      2      0
       0      0      0]
 [   109      0      4      1     17      0      0      0      0  31628
       0      0      1]
 [    26      0      0      0      3      0      0      0      0      0
    1151      0      0]
 [    24      0      0      0      0      0      0      0      0      0
      14    213     50]
 [     7      0      0      0      0      0      0      0      0      0
       1     87     35]]

Correct values:
[[4.4923e+05 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 2.0200e+02 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 2.3843e+04 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 1.8290e+03 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 4.4157e+04 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.0190e+03
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  1.1150e+03 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 1.4920e+03 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 2.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 3.1628e+04 0.0000e+00 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.1510e+03 0.0000e+00
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 2.1300e+02
  0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
  3.5000e+01]]

Incorrect values:
[[0.000e+00 6.100e+01 2.284e+03 1.860e+02 1.615e+03 9.100e+01 3.600e+01
  8.300e+01 0.000e+00 2.460e+02 7.900e+01 1.300e+01 2.000e+00]
 [1.770e+02 0.000e+00 1.100e+01 0.000e+00 1.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [1.479e+03 2.000e+00 0.000e+00 1.200e+01 2.670e+02 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 2.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [2.000e+02 0.000e+00 1.200e+01 0.000e+00 1.600e+01 0.000e+00 0.000e+00
  1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [1.508e+03 0.000e+00 3.340e+02 1.500e+01 0.000e+00 0.000e+00 2.000e+00
  0.000e+00 0.000e+00 9.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [7.700e+01 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 3.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [3.500e+01 0.000e+00 1.000e+00 0.000e+00 1.000e+00 3.000e+00 0.000e+00
  0.000e+00 0.000e+00 2.000e+00 1.000e+00 1.000e+00 0.000e+00]
 [9.400e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00]
 [0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [1.090e+02 0.000e+00 4.000e+00 1.000e+00 1.700e+01 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00]
 [2.600e+01 0.000e+00 0.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [2.400e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.400e+01 0.000e+00 5.000e+01]
 [7.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
  0.000e+00 0.000e+00 0.000e+00 1.000e+00 8.700e+01 0.000e+00]]

True positives:
[4.4923e+05 2.0200e+02 2.3843e+04 1.8290e+03 4.4157e+04 1.0190e+03
 1.1150e+03 1.4920e+03 2.0000e+00 3.1628e+04 1.1510e+03 2.1300e+02
 3.5000e+01]

False positives:
[3736.   63. 2647.  214. 1920.   94.   41.   84.    0.  259.   95.  103.
   53.]
9309.0

False negatives:
[4696.  189. 1762.  230. 1868.   81.   44.   95.    0.  132.   29.   88.
   95.]
9309.0

True negatives:
[107563. 564771. 536973. 562952. 517280. 564031. 564025. 563554. 565223.
 533206. 563950. 564821. 565042.]
6773391.0

In [221]:
# Results

# Load scoring.csv
fh = open("scores.csv", "r")
rdr = csv.reader(fh)
ascores = list(rdr)
fh.close()
print(ascores)

ct = 0       #counter
for e in r:
    
    # Append scores
    e['f1_macro_score'] = ascores[ct+1][2]
    e['f1_macro_leaf_size'] = ascores[ct+1][3]
    e['f1_macro_neighbors'] = ascores[ct+1][4]
    e['f1_macro_weights'] = ascores[ct+1][5]
    e['f1_macro_point_label'] = '(k=' + str(e['f1_macro_neighbors']) + ',wt=' + \
        str(e['f1_macro_weights']) + ',sz=' + str(e['f1_macro_leaf_size']) + ')'
    e['roc_auc_ovr_score'] = ascores[ct+1][14]
    e['roc_auc_ovr_leaf_size'] = ascores[ct+1][15]
    e['roc_auc_ovr_neighbors'] = ascores[ct+1][16]
    e['roc_auc_ovr_weights'] = ascores[ct+1][17]
    e['roc_auc_ovr_point_label'] = '(k=' + str(e['roc_auc_ovr_neighbors']) + ',wt=' + \
        str(e['roc_auc_ovr_weights']) + ',sz=' + str(e['roc_auc_ovr_leaf_size']) + ')'

    # Calcs; need to do some calcs to get true positive rate and false positive rate
    cm = np.array(e['cm'])

    # Correct values
    cm_cor = cm * np.identity(cm.shape[0])
    
    # Incorrect values
    cm_incor = cm - cm_cor
    
    # True positives
    cm_tp = cm_cor
    cm_tp_sum = np.sum(cm_tp)
    
    # False positives
    cm_fp = np.sum(cm_incor, axis=0)
    cm_fp_sum = np.sum(cm_fp)
    
    # False negatives
    cm_fn = np.sum(cm_incor, axis=1)
    cm_fn_sum = np.sum(cm_fp)
    
    # True negatives
    cm_tn = np.zeros(cm.shape[0])
    for i in range(0, cm.shape[0]):
        # Zero out the ith row and ith column
        cm_tmp = cm.copy()
        cm_tmp[i,:] = 0
        cm_tmp[:,i] = 0
        cm_tn[i] = np.sum(cm_tmp)
    cm_tn_sum = np.sum(cm_tn)
    
    # True positive rate = recall = sensitivity = TP / (TP + FN)
    e['tpr'] = np.round(cm_tp_sum / (cm_tp_sum + cm_fn_sum), 4)
    
    # False positive rate = FP / (FP + TN)
    e['fpr'] = np.round(cm_fp_sum / (cm_fp_sum + cm_tn_sum), 4)
    
    # True negative rate = specificity = TN / (TN + FP)
    e['tnr'] = round(cm_tn_sum / (cm_tn_sum + cm_fp_sum), 4)
    
    # Create point label for graph
    e['point_label'] = '(feat=' + str(e['feat']) + ', k=' + str(e['knn_k']) + \
        ', leaf=' + str(e['knn_leaf_size']) + ', wt=' + e['knn_weights'] + ')'
    
    # Results
    print("Run #", ct)
    print("\tPCA features:", e['feat'])
    #print("\tscore:", e['score'])
    #print("\tknn_k:", e['knn_k'])
    #print("\tknn_weights:", e['knn_weights'])
    #print("\tknn_leave_size:", e['knn_leaf_size'])
    print("\tf1_macro_score:", e['f1_macro_score'])
    print("\tf1_macro_neighbors:", e['f1_macro_neighbors'])
    print("\tf1_macro_weights:", e['f1_macro_weights'])
    print("\tf1_macro_leaf_size:", e['f1_macro_leaf_size'])
    print("\troc_auc_ovr_score:", e['roc_auc_ovr_score'])
    print("\troc_auc_ovr_neighbors:", e['roc_auc_ovr_neighbors'])
    print("\troc_auc_ovr_weights:", e['roc_auc_ovr_weights'])
    print("\troc_auc_ovr_leaf_size:", e['roc_auc_ovr_leaf_size'])
    print("\truntime (sec):", e['t'])
    print("\ttrue positive rate:", e['tpr'])
    print("\tfalse positive rate:", e['fpr'])
    print("\ttrue negative rate:", e['tnr'])
    print("\tG-Mean:", np.sqrt(e['tpr'] * e['tnr']))
    print()
    #print("\tclassification report:")
    #print()
    #print(e['cr'])
    #print()
    #print("\tconfusion matrix:")
    #print()
    #print(e['cm'])
    #print()
    ct += 1
[['run', 'pca_cols', 'f1_macro_score', 'f1_macro_leaf_size', 'f1_macro_n_neighbors', 'f1_macro_weights', 'f1_micro_score', 'f1_micro_leaf_size', 'f1_micro_n_neighbors', 'f1_micro_weights', 'f1_weighted_score', 'f1_weighted_leaf_size', 'f1_weighted_n_neighbors', 'f1_weighted_weights', 'roc_auc_ovr_score', 'roc_auc_ovr_leaf_size', 'roc_auc_ovr_n_neighbors', 'roc_auc_ovr_weights', 'roc_auc_ovr_weighted_score', 'roc_auc_ovr_weighted_leaf_size', 'roc_auc_ovr_weighted_n_neighbors', 'roc_auc_ovr_weighted_weights'], ['0', '2', '0.774', '15', '6', 'distance', '0.983', '15', '6', 'distance', '0.983', '15', '6', 'distance', '0.952', '15', '12', 'distance', '0.994', '15', '12', 'distance'], ['1', '3', '0.778', '15', '12', 'distance', '0.984', '15', '3', 'distance', '0.984', '15', '3', 'distance', '0.953', '15', '12', 'distance', '0.995', '15', '12', 'distance'], ['2', '4', '0.788', '15', '6', 'distance', '0.99', '15', '3', 'distance', '0.99', '15', '3', 'distance', '0.955', '15', '12', 'distance', '0.997', '15', '12', 'distance'], ['3', '5', '0.82', '15', '9', 'distance', '0.993', '15', '3', 'distance', '0.993', '15', '3', 'distance', '0.962', '15', '12', 'distance', '0.998', '15', '9', 'distance']]
Run # 0
	PCA features: 2
	f1_macro_score: 0.774
	f1_macro_neighbors: 6
	f1_macro_weights: distance
	f1_macro_leaf_size: 15
	roc_auc_ovr_score: 0.952
	roc_auc_ovr_neighbors: 12
	roc_auc_ovr_weights: distance
	roc_auc_ovr_leaf_size: 15
	runtime (sec): 2760.9946868419647
	true positive rate: 0.9835
	false positive rate: 0.0014
	true negative rate: 0.9986
	G-Mean: 0.9910212409428973

Run # 1
	PCA features: 3
	f1_macro_score: 0.778
	f1_macro_neighbors: 12
	f1_macro_weights: distance
	f1_macro_leaf_size: 15
	roc_auc_ovr_score: 0.953
	roc_auc_ovr_neighbors: 12
	roc_auc_ovr_weights: distance
	roc_auc_ovr_leaf_size: 15
	runtime (sec): 3016.4819836616516
	true positive rate: 0.9844
	false positive rate: 0.0013
	true negative rate: 0.9987
	G-Mean: 0.9915242205816256

Run # 2
	PCA features: 4
	f1_macro_score: 0.788
	f1_macro_neighbors: 6
	f1_macro_weights: distance
	f1_macro_leaf_size: 15
	roc_auc_ovr_score: 0.955
	roc_auc_ovr_neighbors: 12
	roc_auc_ovr_weights: distance
	roc_auc_ovr_leaf_size: 15
	runtime (sec): 3179.8600294589996
	true positive rate: 0.9898
	false positive rate: 0.0009
	true negative rate: 0.9991
	G-Mean: 0.9944391283532643

Run # 3
	PCA features: 5
	f1_macro_score: 0.82
	f1_macro_neighbors: 9
	f1_macro_weights: distance
	f1_macro_leaf_size: 15
	roc_auc_ovr_score: 0.962
	roc_auc_ovr_neighbors: 12
	roc_auc_ovr_weights: distance
	roc_auc_ovr_leaf_size: 15
	runtime (sec): 3396.462958574295
	true positive rate: 0.9929
	false positive rate: 0.0006
	true negative rate: 0.9994
	G-Mean: 0.9961446983244954

In [398]:
# Set plotly renderer
pio.renderers.default = 'notebook'

# Read scores, filter just the ones we want
df_scores = pd.read_csv('scores2.csv')
#df_scores['point_label'] = '(k=' + df_scores['n_neighbors'].astype(str) + ',' + 'wt=' + \
#    df_scores['weights'].astype(str) + ',sz=' + df_scores['leaf_size'].astype(str) + ')'
df_scores['point_label'] = 'k=' + df_scores['n_neighbors'].astype(str)
df_scores = df_scores[(df_scores['method'] == 'f1_macro') | (df_scores['method'] == 'roc_auc_ovr')]
display(df_scores)

# Create data frame out of results array
df_result = pd.DataFrame(r)

# Plot scores
#fig1 = px.scatter(df_result, x='feat', y='f1_macro_score', text='f1_macro_point_label')
fig1 = px.scatter(df_scores, x='feat', y='score', text='point_label', facet_col='method', \
    template='plotly_white', facet_col_spacing=0.1)
fig1.update_traces(textposition='top center')
fig1.update_layout(xaxis_range=[1, 6])
fig1.update_layout(yaxis_range=[0.75, 1.0])
fig1.update_xaxes(title="Number of features")
fig1.update_yaxes(title="Score")
fig1.show()

# Plot ROC curve
#roc = px.scatter(df_result, x='fpr', y='tpr', text='point_label')
#roc.update_traces(textposition='bottom right')
#roc.update_layout(showlegend=False)
#roc.show()

# Read scores
df_scores1 = pd.read_csv('scores.csv')
df_scores1['point_label'] = df_scores1['time_min'].astype(str) + ' min (f1=' + df_scores1['f1_macro_score'].astype(str) + ')'
df_scores1 = df_scores1.drop(
    df_scores1.columns.difference(['run', 'feat', 'f1_macro_score', 'time_min', 'point_label']), axis=1)
display(df_scores1)

# Plot runtime
fig2 = px.scatter(df_scores1, x='feat', y='time_min', text='point_label', template='plotly_white')
fig2.update_traces(textposition='top center')
fig2.update_layout(xaxis_range=[1, 6])
fig2.update_layout(yaxis_range=[40, 60])
fig2.update_xaxes(title="Number of features")
fig2.update_yaxes(title="Runtime (min)")
fig2.show()
run feat method score leaf_size n_neighbors weights point_label
0 0 2 f1_macro 0.774 15 6 distance k=6
3 0 2 roc_auc_ovr 0.952 15 12 distance k=12
5 1 3 f1_macro 0.778 15 12 distance k=12
8 1 3 roc_auc_ovr 0.953 15 12 distance k=12
10 2 4 f1_macro 0.788 15 6 distance k=6
13 2 4 roc_auc_ovr 0.955 15 12 distance k=12
15 3 5 f1_macro 0.820 15 9 distance k=9
18 3 5 roc_auc_ovr 0.962 15 12 distance k=12
run feat f1_macro_score time_min point_label
0 0 2 0.774 46 46 min (f1=0.774)
1 1 3 0.778 50 50 min (f1=0.778)
2 2 4 0.788 53 53 min (f1=0.788)
3 3 5 0.820 57 57 min (f1=0.82)
In [399]:
# Confusion matrix for best model
cm = pd.DataFrame(r[3]['cm'])
cm.columns = list(r[3]['cr'].keys())[0:13]
cm.index = list(r[3]['cr'].keys())[0:13]
display(cm)

# Reduced confusion matrix (one vs rest) - consider any malware as the positive class
cmr = np.zeros([2, 2]).astype(int)
cmarray = np.array(r[3]['cm'])
cmr[1, 1] = cmarray[0, 0]                        #TN=actually benign and predicted to be benign
cmr[1, 0] = np.sum(cmarray[0, 1:])               #FP=actually benign but predicted to be malware
cmr[0, 1] = np.sum(cmarray[1:, 0])               #FN=actually malware but predicted to be benign
cmr[0, 0] = np.sum(cmarray) - cmarray[0, 0]      #TP=actually malware and predicted to be malware
cmr = pd.DataFrame(cmr)
cmr.columns = ['Malware', 'Benign']
cmr.index = ['Malware', 'Benign']
display(cmr)
BENIGN Bot DDoS DoS GoldenEye DoS Hulk DoS Slowhttptest DoS slowloris FTP-Patator Heartbleed PortScan SSH-Patator Web Attack - Brute Force Web Attack - XSS
BENIGN 451913 45 519 67 926 74 31 44 0 240 60 4 3
Bot 127 257 3 0 4 0 0 0 0 0 0 0 0
DDoS 121 0 25142 3 339 0 0 0 0 0 0 0 0
DoS GoldenEye 82 0 0 1973 1 0 0 1 0 0 0 2 0
DoS Hulk 656 0 277 1 45084 0 1 0 0 6 0 0 0
DoS Slowhttptest 36 0 0 0 0 1058 4 0 0 0 0 0 2
DoS slowloris 26 0 0 0 2 5 1122 0 0 2 1 1 0
FTP-Patator 29 0 0 0 0 0 0 1556 0 0 1 1 0
Heartbleed 0 0 0 0 0 0 0 0 2 0 0 0 0
PortScan 65 0 2 0 14 0 0 0 0 31678 0 0 1
SSH-Patator 23 0 0 0 0 0 0 1 0 0 1156 0 0
Web Attack - Brute Force 5 0 0 0 0 0 0 0 0 0 32 212 52
Web Attack - XSS 6 0 0 0 1 0 0 0 0 0 1 86 36
Malware Benign
Malware 113312 1176
Benign 2013 451913

precision = $\frac{TP}{TP + FP}$ = 0.983 \ recall = $\frac{TP}{TP + FN}$ = 0.990 \ false positive rate = $\frac{FP}{FP + TN}$ = 0.00443

In [401]:
# Calculate highest and lowest tpr's, fpr's

# Create empty dataframe like the confusion matrix
rates = pd.DataFrame(columns=['malware', 'samples', 'fp', 'fn', 'tpr', 'fpr', 'tpr_text', 'fpr_text'])

# Iterate over the non-BENIGN columns in cm
for i in range(1, cm.shape[1]):
    samples = np.sum(cm.iloc[i,:])
    fp = cm.iloc[0, i]
    fn = cm.iloc[i, 0]
    tpr = cm.iloc[i, i] / (cm.iloc[i, i] + cm.iloc[i, 0])
    fpr = cm.iloc[0, i] / (cm.iloc[0, i] + cm.iloc[0, 0])
    tpr_text = str(round(tpr * 100, 3)) + '% (' + str(fn) +' FNs)'
    fpr_text = str(round(fpr * 100, 3)) + '% (' + str(fp) +' FPs)'
    newRow = pd.DataFrame({'malware': cm.columns[i], 'samples': samples, 'fp': fp, 'fn': fn, 'tpr': tpr, 'fpr': fpr, \
        'tpr_text': tpr_text, 'fpr_text': fpr_text}, index=[i-1])
    rates = pd.concat([rates, newRow], axis=0)

display(rates)
malware samples fp fn tpr fpr tpr_text fpr_text
0 Bot 391 45 127 0.669271 0.0001 66.927% (127 FNs) 0.01% (45 FPs)
1 DDoS 25605 519 121 0.99521 0.001147 99.521% (121 FNs) 0.115% (519 FPs)
2 DoS GoldenEye 2059 67 82 0.960097 0.000148 96.01% (82 FNs) 0.015% (67 FPs)
3 DoS Hulk 46025 926 656 0.985658 0.002045 98.566% (656 FNs) 0.204% (926 FPs)
4 DoS Slowhttptest 1100 74 36 0.967093 0.000164 96.709% (36 FNs) 0.016% (74 FPs)
5 DoS slowloris 1159 31 26 0.977352 0.000069 97.735% (26 FNs) 0.007% (31 FPs)
6 FTP-Patator 1587 44 29 0.981703 0.000097 98.17% (29 FNs) 0.01% (44 FPs)
7 Heartbleed 2 0 0 1.0 0.0 100.0% (0 FNs) 0.0% (0 FPs)
8 PortScan 31760 240 65 0.997952 0.000531 99.795% (65 FNs) 0.053% (240 FPs)
9 SSH-Patator 1180 60 23 0.980492 0.000133 98.049% (23 FNs) 0.013% (60 FPs)
10 Web Attack - Brute Force 301 4 5 0.976959 0.000009 97.696% (5 FNs) 0.001% (4 FPs)
11 Web Attack - XSS 130 3 6 0.857143 0.000007 85.714% (6 FNs) 0.001% (3 FPs)
In [402]:
# Plot tpr
rates = rates.sort_values(by = ['tpr'], ascending=False)
fig1 = px.bar(rates, y='malware', x='tpr', template='plotly_white', orientation='h', \
    log_x=False, text='tpr_text')
fig1.update_traces(marker_color='black')
fig1.update_xaxes(title="True positive rate")
fig1.update_yaxes(title="Malware")
#fig1.update_layout(title={'text': 'Hardest malware to identify', 'xanchor': 'left', 'yanchor': 'top'})
fig1.show()

# Plot tpr
rates = rates.sort_values(by = ['fpr'], ascending=True)
fig2 = px.bar(rates, y='malware', x='fpr', template='plotly_white', orientation='h', \
    log_x=False, text='fpr_text')
fig2.update_traces(marker_color='black')
fig2.update_xaxes(title="False positive rate")
fig2.update_yaxes(title="Malware")
#fig2.update_layout(title={'text': 'Malware most prone to false positives', 'xanchor': 'left', 'yanchor': 'top'})
fig2.show()